Spaces:
Runtime error
Runtime error
| """Compute text statistics for a document.""" | |
| from typing import TYPE_CHECKING, Iterable, Optional, cast | |
| from typing_extensions import override | |
| from ..schema import Field, Item, RichData, field | |
| from ..signal import TextSignal | |
| from ..utils import chunks | |
# Name of the spaCy pipeline package that `TextStatisticsSignal.setup` downloads (when it is not
# already installed) and loads.
SPACY_LANG_MODEL = 'en_core_web_sm'
# Number of documents grouped into each chunk processed by `TextStatisticsSignal.compute`.
SPACY_BATCH_SIZE = 128
# Keys of the sub-fields declared by `TextStatisticsSignal.fields` and of the items yielded by
# `TextStatisticsSignal.compute`.
NUM_CHARS = 'num_characters'
READABILITY = 'readability'
TYPE_TOKEN_RATIO = 'log(type_token_ratio)'
FRAC_NON_ASCII = 'frac_non_ascii'
| if TYPE_CHECKING: | |
| from spacy import Language | |
| from spacy.tokens import Doc | |
class TextStatisticsSignal(TextSignal):
    """Compute text statistics for a document.

    For each document the signal reports the number of characters, the automated readability
    index, the log type-token ratio and the fraction of non-ASCII characters.
    """
    name = 'text_statistics'
    display_name = 'Text Statistics'

    # spaCy pipeline used to build the textacy corpus; stays `None` until `setup()` has run.
    _lang: Optional['Language'] = None

    def fields(self) -> Field:
        """Return the schema of the items yielded by `compute`.

        The non-ASCII fraction is additionally declared with 'Low' / 'Medium' / 'High' bins.
        """
        return field(
            fields={
                NUM_CHARS: 'int32',
                READABILITY: 'float32',
                TYPE_TOKEN_RATIO: 'float32',
                FRAC_NON_ASCII: field(
                    'float32',
                    bins=[('Low', None, 0.15), ('Medium', 0.15, 0.3), ('High', 0.3, None)]),
            })

    def setup(self) -> None:
        """Load the spaCy language model, downloading it first if it is not installed.

        Raises:
          ImportError: If the `spacy` package cannot be imported.
        """
        try:
            import spacy
            import spacy.cli
            import spacy.util
        except ImportError as e:
            # Chain explicitly so the original import failure stays attached as the cause.
            raise ImportError('Could not import the "spacy" python package. '
                              'Please install it with `pip install spacy`.') from e

        if not spacy.util.is_package(SPACY_LANG_MODEL):
            spacy.cli.download(SPACY_LANG_MODEL)

        # NOTE(review): these components are switched off at load time, presumably because the
        # statistics computed in `compute` only need tokenization -- confirm before re-enabling.
        disabled_components = [
            'parser', 'tagger', 'ner', 'lemmatizer', 'textcat', 'custom', 'tok2vec',
            'attribute_ruler'
        ]
        self._lang = spacy.load(SPACY_LANG_MODEL, disable=disabled_components)

    def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
        """Yield a statistics item for each document, or `None` for empty documents.

        Documents are processed in chunks of `SPACY_BATCH_SIZE`. A document that has no tokens or
        whose text is only whitespace yields `None`. Otherwise the yielded dict holds the keys
        `NUM_CHARS`, `READABILITY`, `TYPE_TOKEN_RATIO` and `FRAC_NON_ASCII`; the readability and
        type-token-ratio values are `None` when textacy cannot compute them.

        This is a generator, so the errors below surface when iteration starts.

        Raises:
          ImportError: If the `textacy` package cannot be imported.
          RuntimeError: If the language model has not been loaded (see `setup`).
        """
        try:
            import textacy.corpus
            from textacy import text_stats
        except ImportError as e:
            # Chain explicitly so the original import failure stays attached as the cause.
            raise ImportError('Could not import the "textacy" python package. '
                              'Please install it with `pip install textacy`.') from e

        if self._lang is None:
            raise RuntimeError('Language model was not loaded.')

        data = cast(Iterable[str], data)
        for batch in chunks(data, SPACY_BATCH_SIZE):
            # Replace None with empty strings to avoid spacy errors.
            batch = [x or '' for x in batch]
            # See https://textacy.readthedocs.io/en/0.11.0/api_reference/text_stats.html for a
            # list of available statistics.
            corpus = textacy.corpus.Corpus(lang=self._lang, data=batch)
            for doc in cast(Iterable['Doc'], corpus):
                # Read the text once; it is needed for the emptiness check and the counts below.
                text = doc.text
                if not doc or not text.strip():
                    yield None
                    continue

                try:
                    readability = text_stats.readability.automated_readability_index(doc)
                except ZeroDivisionError:
                    readability = None

                try:
                    ttr = text_stats.diversity.log_ttr(doc)
                except ValueError:
                    ttr = None

                num_chars = len(text)
                num_non_ascii = sum(1 for ch in text if ord(ch) >= 128)
                # `text` holds at least one non-whitespace character here (guard above), so
                # `num_chars` is never zero.
                frac_non_ascii = num_non_ascii / num_chars

                yield {
                    NUM_CHARS: num_chars,
                    READABILITY: readability,
                    TYPE_TOKEN_RATIO: ttr,
                    FRAC_NON_ASCII: frac_non_ascii,
                }