Delete utils/lexical_search.py
utils/lexical_search.py +0 -251
utils/lexical_search.py
DELETED
@@ -1,251 +0,0 @@
from haystack.nodes import TfidfRetriever
from haystack.document_stores import InMemoryDocumentStore
import spacy
import re
from spacy.matcher import Matcher
from markdown import markdown
from annotated_text import annotation
from haystack.schema import Document
from typing import List, Text, Tuple
from typing_extensions import Literal
from utils.preprocessing import processingpipeline
from utils.streamlitcheck import check_streamlit
import logging
try:
    from termcolor import colored
except ImportError:
    pass

try:
    import streamlit as st
except ImportError:
    logging.info("Streamlit not installed")


def runLexicalPreprocessingPipeline(file_name: str, file_path: str,
                        split_by: Literal["sentence", "word"] = 'word',
                        split_length: int = 80, split_overlap: int = 0,
                        remove_punc: bool = False) -> List[Document]:
    """
    Creates and runs the preprocessing pipeline; the pipeline params are
    fetched from paramconfig. As lexical search is not affected by overlap,
    the default paramconfig uses split_overlap = 0 and split_by = 'word'.

    Params
    ------------
    file_name: filename, in case of streamlit application use
                st.session_state['filename']
    file_path: filepath, in case of streamlit application use
                st.session_state['filepath']
    split_by: document splitting strategy, either 'word' or 'sentence'
    split_length: length of each paragraph when synthetically creating
                paragraphs from the document
    split_overlap: number of words or sentences that overlap between
                consecutive paragraphs. One sentence or a few words often
                only make sense when read together with their neighbours,
                which is why an overlap can be used when splitting the text.
    remove_punc: whether to remove all punctuation, including ',' and '.'

    Return
    --------------
    List[Document]: running the preprocessing pipeline returns a dictionary
                with four objects. For lexical search using the TfidfRetriever
                we need the list of Haystack Documents, which can be fetched
                with key = 'documents' on the output.

    """

    lexical_processing_pipeline = processingpipeline()

    output_lexical_pre = lexical_processing_pipeline.run(
        file_paths=file_path,
        params={"FileConverter": {"file_path": file_path,
                                  "file_name": file_name},
                "UdfPreProcessor": {"remove_punc": remove_punc,
                                    "split_by": split_by,
                                    "split_length": split_length,
                                    "split_overlap": split_overlap}})

    return output_lexical_pre


def tokenize_lexical_query(query: str) -> List[str]:
    """
    Removes the stop words from the query and returns the list of important
    keywords in it. For lexical search the relevant paragraphs in the document
    are retrieved using the TfidfRetriever from Haystack; however, to highlight
    these keywords we need the tokenized form of the query.

    Params
    --------
    query: string representing either the keywords the user is looking for
            or a query in the form of a question.

    Return
    -----------
    token_list: list of important keywords in the query.

    """
    nlp = spacy.load("en_core_web_sm")
    token_list = [token.text.lower() for token in nlp(query)
                  if not (token.is_stop or token.is_punct)]
    return token_list

def runSpacyMatcher(token_list: List[str], document: Text
                    ) -> Tuple[List[List[int]], spacy.tokens.doc.Doc]:
    """
    Finds the keywords in the document using spacy's Matcher class in the
    backend. We could alternatively use regex, but spacy finds all keywords
    in a serialized manner, which helps in annotating the answers.

    Params
    -------
    token_list: the token list returned by tokenize_lexical_query
    document: text in which we need to find the tokens

    Return
    --------
    matches: list of [start_index, end_index] pairs in the spacy doc (at word
            level, not character level) for the keywords in the token list.
    spacydoc: the keyword indices in the spacy doc are at word level, not
            character level, so we return the spacy doc to let the annotator
            work seamlessly.

    """
    nlp = spacy.load("en_core_web_sm")
    spacydoc = nlp(document)
    matcher = Matcher(nlp.vocab)
    token_pattern = [[{"LOWER": token}] for token in token_list]
    matcher.add(",".join(token_list), token_pattern)
    spacymatches = matcher(spacydoc)

    # getting start and end index in spacydoc so that annotator can work seamlessly
    matches = []
    for match_id, start, end in spacymatches:
        matches = matches + [[start, end]]

    return matches, spacydoc

def runRegexMatcher(token_list: List[str], document: Text):
    """
    Finds the keywords in the document using regex in the backend.

    Params
    -------
    token_list: the token list returned by tokenize_lexical_query
    document: text in which we need to find the tokens

    Return
    --------
    matches: list of [start_index, end_index] pairs in the document for the
            keywords in the token list, at character level.
    document: the keyword indices returned by regex are at character level,
            so we return the text to let the annotator work seamlessly.

    """
    matches = []
    for token in token_list:
        matches = (matches +
                   [[val.start(), val.start() + len(token)]
                    for val in re.finditer(token, document)])

    return matches, document

def spacyAnnotator(matches: List[List[int]], document: spacy.tokens.doc.Doc):
    """
    Spacy annotator, needs a spacy doc.
    Annotates the text in the document defined by a list of
    [start_index, end_index] pairs.
    Example: for "How are you today", if the document is plain text,
    matches = [[0, 3]] gives answer = "How" (character level); with the
    spacy matcher, matches = [[0, 3]] gives answer = "How are you"
    (word level), and finding "How" with spacy gives matches = [[0, 1]]
    for the same string.

    Params
    -----------
    matches: list of [start_index, end_index] pairs, e.g. [[0, 1], [10, 13]]
    document: document which needs to be annotated.

    Return
    --------
    Sends the output to the app front end using streamlit, or writes it
    directly to the output screen.

    """
    start = 0
    end_idx = 0  # guards the tail slice below when matches is empty
    annotated_text = ""
    for match in matches:
        start_idx = match[0]
        end_idx = match[1]

        if check_streamlit():
            annotated_text = (annotated_text + document[start:start_idx].text
                              + str(annotation(body=document[start_idx:end_idx].text,
                                    label="ANSWER", background="#964448", color='#ffffff')))
        else:
            annotated_text = (annotated_text + document[start:start_idx].text
                              + colored(document[start_idx:end_idx].text,
                                        "green", attrs=['bold']))

        start = end_idx

    annotated_text = annotated_text + document[end_idx:].text

    if check_streamlit():
        st.write(
            markdown(annotated_text),
            unsafe_allow_html=True,
        )
    else:
        print(annotated_text)

def lexical_search(query: Text, documents: List[Document], top_k: int):
    """
    Performs lexical search on the list of Haystack Documents returned by
    the preprocessing pipeline.

    Params
    -------
    query: keywords that need to be searched in the documents.
    documents: list of Haystack Documents returned by the preprocessing pipeline.
    top_k: number of top results to be fetched.

    """

    document_store = InMemoryDocumentStore()
    document_store.write_documents(documents)

    # Haystack Retriever works with document stores only.
    retriever = TfidfRetriever(document_store)
    results = retriever.retrieve(query=query, top_k=top_k)
    query_tokens = tokenize_lexical_query(query)
    flag = True
    for count, result in enumerate(results):
        matches, doc = runSpacyMatcher(query_tokens, result.content)

        if len(matches) != 0:
            if flag:
                flag = False
                if check_streamlit():
                    st.markdown("##### Top few lexical search (TFIDF) hits #####")
                else:
                    print("Top few lexical search (TFIDF) hits")

            if check_streamlit():
                st.write("Result {}".format(count + 1))
            else:
                print("Result {}".format(count + 1))
            spacyAnnotator(matches, doc)

    if flag:
        if check_streamlit():
            st.info("🤔 No relevant result found. Please try another keyword.")
        else:
            print("No relevant result found. Please try another keyword.")
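For reference, a minimal sketch of how this module was driven end to end before its removal. The file name, path, and query below are illustrative only, and the repo's own processingpipeline defaults are assumed:

```python
from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search

# Preprocess a document into ~80-word paragraphs (hypothetical file name/path).
output = runLexicalPreprocessingPipeline(file_name="report.pdf",
                                         file_path="docs/report.pdf")

# The pipeline output is a dict; the Haystack Documents sit under 'documents'.
paragraphs = output['documents']

# Retrieve and annotate the top 3 TF-IDF hits for an illustrative query.
lexical_search(query="climate adaptation", documents=paragraphs, top_k=3)
```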
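The tokenizer/matcher pair operates at word level. A sketch of the indices it produces, assuming en_core_web_sm is installed; the sentences are made up:

```python
from utils.lexical_search import tokenize_lexical_query, runSpacyMatcher

# Stop words ("what", "are", "the") and punctuation are dropped.
tokens = tokenize_lexical_query("What are the mitigation targets?")
# tokens == ['mitigation', 'targets']

matches, doc = runSpacyMatcher(tokens, "The mitigation targets are listed below.")
# Word-level [start, end) offsets into the spacy doc, e.g.:
# matches == [[1, 2], [2, 3]]  ->  doc[1:2].text == 'mitigation'
```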
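runRegexMatcher, by contrast, returns character offsets, which is why it hands the raw text back to the annotator rather than a spacy doc. Same illustrative sentence:

```python
from utils.lexical_search import runRegexMatcher

matches, text = runRegexMatcher(['targets'], "The mitigation targets are listed.")
# Character-level [start, end) offsets into the string:
# matches == [[15, 22]]  ->  text[15:22] == 'targets'
```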