|
from pathlib import Path |
|
from typing import Any, Dict, List, Optional, Union |
|
|
|
from haystack import Document, component, logging |
|
from haystack.components.converters.utils import ( |
|
get_bytestream_from_source, |
|
normalize_metadata, |
|
) |
|
from haystack.dataclasses import ByteStream |
|
|
|
from trafilatura import extract |
|
|
|
import json |
|
import json_repair |
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
@component
class TrafilaturaHTMLConverter:
    """
    Converts an HTML file to a Document using Trafilatura.

    Usage example:
    ```python
    converter = TrafilaturaHTMLConverter()
    results = converter.run(sources=["path/to/sample.html"])
    documents = results["documents"]
    print(documents[0].content)
    # 'This is a text from the HTML file.'
    ```
    """

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        extract_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        Converts a list of HTML files to Documents.

        Sources that cannot be read, or whose text extraction raises, are logged
        with a warning and skipped; they do not abort the whole run.

        :param sources:
            List of HTML file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the Documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced Documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
        :param extract_kwargs:
            Additional keyword arguments to pass to the Trafilatura `extract` method.
            Defaults to no extra arguments.
            See the [Trafilatura documentation](https://trafilatura.readthedocs.io/en/latest/usage-python.html) for more information.

        :returns:
            A dictionary with the following keys:
            - `documents`: Created Documents
        """
        # Normalise once so the loop can unconditionally splat the kwargs.
        extra_extract_kwargs = extract_kwargs or {}

        documents = []
        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            try:
                bytestream = get_bytestream_from_source(source=source)
            except Exception as e:
                logger.warning(
                    "Could not read {source}. Skipping it. Error: {error}",
                    source=source,
                    error=e,
                )
                continue

            try:
                # Decoding happens inside the try block so that a non-UTF-8 payload
                # is reported and skipped like any other extraction failure.
                text = extract(bytestream.data.decode("utf-8"), **extra_extract_kwargs)
            except Exception as conversion_e:
                logger.warning(
                    "Failed to extract text from {source}. Error: {error}",
                    source=source,
                    error=conversion_e,
                )
                continue

            # NOTE(review): Trafilatura's `extract` may return None when it finds no
            # main content; such a source still yields a Document (content=None).
            # Confirm this is the intended behaviour.
            # Caller-supplied metadata takes precedence over the ByteStream's own meta.
            document = Document(content=text, meta={**bytestream.meta, **metadata})
            documents.append(document)

        return {"documents": documents}
|
|
|
|
|
@component
class QuizParser:
    """
    Extracts a quiz object from the raw text reply of a generator.

    The reply may contain free text around the JSON payload; only the span from
    the first opening bracket (`{` or `[`) to the last closing bracket (`}` or `]`)
    is parsed.
    """

    @component.output_types(quiz=Dict)
    def run(self, replies: List[str]):
        """
        Parses the first reply into a quiz object.

        :param replies:
            List of text replies. Only the first element is used.

        :returns:
            A dictionary with the following keys:
            - `quiz`: The parsed JSON value. If the payload is a JSON list, its first element is returned.

        :raises IndexError:
            If `replies` is empty, or if the parsed payload is an empty list.
        """
        reply = replies[0]

        # `str.find` returns -1 when the character is absent. Taking a plain
        # `min` over both results would therefore pick -1 whenever only one kind
        # of bracket is present, making the slice start at the last character.
        # Consider only the brackets that were actually found.
        opening_positions = [pos for pos in (reply.find("{"), reply.find("[")) if pos != -1]
        first_index = min(opening_positions) if opening_positions else 0
        # `rfind` also returns -1 on a miss; `max` already prefers a real match,
        # and with no closing bracket at all this yields 0 (an empty slice).
        last_index = max(reply.rfind("}"), reply.rfind("]")) + 1

        json_portion = reply[first_index:last_index]

        try:
            quiz = json.loads(json_portion)
        except json.JSONDecodeError:
            # Strict parsing failed; let json_repair attempt to recover
            # malformed JSON.
            quiz = json_repair.loads(json_portion)

        # A top-level list is unwrapped to its first element.
        if isinstance(quiz, list):
            quiz = quiz[0]

        logger.debug("Parsed quiz: {quiz}", quiz=quiz)

        return {"quiz": quiz}
|
|