autoquizzer / backend /custom_components.py
anakin87's picture
first commit
8bd40a9
raw
history blame
No virus
3.86 kB
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from haystack import Document, component, logging
from haystack.components.converters.utils import (
get_bytestream_from_source,
normalize_metadata,
)
from haystack.dataclasses import ByteStream
from trafilatura import extract
import json
import json_repair
# Module-level logger obtained from Haystack's `logging` module (imported above);
# the components in this file use it to report sources that had to be skipped.
logger = logging.getLogger(__name__)
@component
class TrafilaturaHTMLConverter:
    """
    Converts an HTML file to a Document using Trafilatura.

    Usage example:
    ```python
    converter = TrafilaturaHTMLConverter()
    results = converter.run(sources=["path/to/sample.html"])
    documents = results["documents"]
    print(documents[0].content)
    # 'This is a text from the HTML file.'
    ```
    """

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        extract_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        Converts a list of HTML files to Documents.

        Sources that cannot be read, or whose content cannot be decoded/extracted,
        are logged with a warning and skipped, so the output may contain fewer
        Documents than there are sources.

        :param sources:
            List of HTML file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the Documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced Documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
        :param extract_kwargs:
            Additional keyword arguments to pass to the Trafilatura `extract` method.
            See the [Trafilatura documentation](https://trafilatura.readthedocs.io/en/latest/usage-python.html) for more information.

        :returns:
            A dictionary with the following keys:
            - `documents`: Created Documents
        """
        # Treat a missing value as "no extra options" so the call below stays uniform.
        extraction_options = extract_kwargs or {}

        documents = []
        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            try:
                bytestream = get_bytestream_from_source(source=source)
            except Exception as e:
                logger.warning(
                    "Could not read {source}. Skipping it. Error: {error}",
                    source=source,
                    error=e,
                )
                continue

            try:
                # Decoding is inside the `try` on purpose: a source that is not valid
                # UTF-8 is skipped like any other extraction failure.
                text = extract(bytestream.data.decode("utf-8"), **extraction_options)
            except Exception as conversion_e:
                logger.warning(
                    "Failed to extract text from {source}. Error: {error}",
                    source=source,
                    error=conversion_e,
                )
                continue

            # NOTE(review): Trafilatura's `extract` is documented to return None when it
            # finds no main content; in that case the Document is still emitted with
            # `content=None` -- confirm downstream components handle this.
            # Metadata passed by the caller overrides the ByteStream's own metadata
            # on key collisions.
            document = Document(content=text, meta={**bytestream.meta, **metadata})
            documents.append(document)

        return {"documents": documents}
@component
class QuizParser:
    """
    Parses the quiz (a JSON object) out of the raw text reply of a language model.

    The reply may wrap the JSON in extra text and the JSON itself may be
    malformed; this component isolates the JSON portion and, if needed,
    repairs it with `json_repair`.
    """

    @component.output_types(quiz=Dict)
    def run(self, replies: List[str]):
        """
        Extracts and parses the JSON quiz contained in the first reply.

        :param replies:
            Replies produced by the language model. Only the first one is used.

        :returns:
            A dictionary with the following keys:
            - `quiz`: The parsed quiz. If the reply contains a JSON list,
              its first element is returned.

        :raises IndexError:
            If `replies` is empty, or if the parsed JSON is an empty list.
        """
        reply = replies[0]

        # even if prompted to respond with JSON only, sometimes the model returns a mix of JSON and text
        # `str.find` returns -1 when the character is absent, so -1 must be discarded
        # before taking the minimum; otherwise a reply containing only "{" (or only "[")
        # would be sliced from its last character.
        opening_indices = [index for index in (reply.find("{"), reply.find("[")) if index != -1]
        first_index = min(opening_indices) if opening_indices else 0

        last_index = max(reply.rfind("}"), reply.rfind("]")) + 1
        if last_index <= first_index:
            # No closing bracket after the opening one (e.g. truncated output):
            # keep everything up to the end and let the repair step deal with it.
            last_index = len(reply)

        json_portion = reply[first_index:last_index]

        try:
            quiz = json.loads(json_portion)
        except json.JSONDecodeError:
            # if the JSON is not well-formed, try to repair it
            quiz = json_repair.loads(json_portion)

        # sometimes the JSON contains a list instead of a dictionary
        if isinstance(quiz, list):
            quiz = quiz[0]

        logger.debug("Parsed quiz: {quiz}", quiz=quiz)

        return {"quiz": quiz}