Spaces:

seanpedrickcase
/

document_rag_preparation

Running

File size: 7,201 Bytes

0b2c988

import re
import string
import polars as pl
import gradio as gr
import time
from datetime import datetime
import tools.anonymiser as anon
from unstructured.staging.base import convert_to_dataframe

from typing import List
from unstructured.documents.elements import Element

from tools.unstructured_funcs import export_elements_as_table_to_file

today_rev = datetime.now().strftime("%Y%m%d")

chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']

# Adding custom words to the stopwords
custom_words = []
my_stop_words = custom_words

# #### Some of my cleaning functions
html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
email_pattern_regex = r'\S*@\S*\s?'
num_pattern_regex = r'[0-9]+'
nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
multiple_spaces_regex = r'\s{2,}'

def pre_clean(data:List[Element], in_colnames:str, custom_regex:List[str], clean_text:str, data_file_name_no_ext:str="combined_elements", anonymise_drop:List[str]="No", anon_strat:str = "redact", anon_entities:List[str]=chosen_redact_entities, progress=gr.Progress(track_tqdm=True)):
    '''
    Clean open text in tabular format with custom regex or anonymisation.
    '''
    
    output_text = ""
    output_list = []

    progress(0, desc = "Cleaning data")

    if not in_colnames:
        error_message = "Please enter one column name to use for cleaning and finding topics."
        print(error_message)
        return error_message, None, data_file_name_no_ext, None, None

    all_tic = time.perf_counter()

    output_list = []
    #file_list = [string.name for string in in_files]

    in_colnames_list_first = in_colnames[0]

    if clean_text == "Yes":
        clean_tic = time.perf_counter()
        print("Starting data clean.")

        for element in data:
            if not custom_regex.empty:
                cleaned_data = initial_clean([element.text], custom_regex.iloc[:, 0].to_list())
            else:
                cleaned_data = initial_clean([element.text], [])

            element.text = cleaned_data[0]
            print(element.text)

        clean_toc = time.perf_counter()
        clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
        print(clean_time_out)

    if anonymise_drop == "Yes":
        progress(0.6, desc= "Anonymising data")

        data_file_name_no_ext = data_file_name_no_ext + "_anon"

        anon_tic = time.perf_counter()

        data_list = []

        for element in data:
            data_list.append(element.text)
    
        data_anon_col, anonymisation_success = anon.anonymise_script(data_list, anon_strat=anon_strat)

        for i, element in enumerate(data):
            element.text = data_anon_col[i]

        print(anonymisation_success)

        anon_toc = time.perf_counter()
        time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"

    alt_out_message, out_files, output_file_base = export_elements_as_table_to_file(data, data_file_name_no_ext, file_name_suffix="_clean")

    all_toc = time.perf_counter()
    time_out = f"All processes took {all_toc - all_tic:0.1f} seconds."
    print(time_out)

    output_text = "Data clean completed."
    
    return output_text, out_files, data, output_file_base


def initial_clean(texts, custom_regex, progress=gr.Progress()):
    #texts = pl.Series(texts).str.strip_chars()
    #text = texts.str.replace_all(html_pattern_regex, ' ')
    #text = text.str.replace_all(html_start_pattern_end_dots_regex, ' ')
    #text = text.str.replace_all(email_pattern_regex, ' ')
    #text = text.str.replace_all(nums_two_more_regex, ' ')
    #text = text.str.replace_all(postcode_pattern_regex, ' ')

    texts = pl.Series(texts)

    # Allow for custom regex patterns to be removed
    if len(custom_regex) > 0:
        for pattern in custom_regex:
            raw_string_pattern = rf"{pattern}"  # Case-insensitive regex
            #print(f"Removing regex pattern: {raw_string_pattern}")
            text = text.str.replace_all(raw_string_pattern, " ")
            #print("Text without pattern: ", text[0])
    

    #text = text.str.replace_all(multiple_spaces_regex, ' ')

    text = text.to_list()
    
    return text

def remove_hyphens(text_text):
    return re.sub(r'(\w+)-(\w+)-?(\w)?', r'\1 \2 \3', text_text)


def remove_characters_after_tokenization(tokens):
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens])
    return filtered_tokens

def convert_to_lowercase(tokens):
    return [token.lower() for token in tokens if token.isalpha()]

def remove_short_tokens(tokens):
    return [token for token in tokens if len(token) > 3]


def remove_dups_text(data_samples_ready, data_samples_clean, data_samples):
   # Identify duplicates in the data: https://stackoverflow.com/questions/44191465/efficiently-identify-duplicates-in-large-list-500-000
    # Only identifies the second duplicate

    seen = set()
    dups = []

    for i, doi in enumerate(data_samples_ready):
        if doi not in seen:
            seen.add(doi)
        else:
            dups.append(i) 
    #data_samples_ready[dupes[0:]]
    
    # To see a specific duplicated value you know the position of
    #matching = [s for s in data_samples_ready if data_samples_ready[83] in s]
    #matching
    
    # Remove duplicates only (keep first instance)
    #data_samples_ready = list( dict.fromkeys(data_samples_ready) ) # This way would keep one version of the duplicates
    
    ### Remove all duplicates including original instance
    
    # Identify ALL duplicates including initial values
    # https://stackoverflow.com/questions/11236006/identify-duplicate-values-in-a-list-in-python

    from collections import defaultdict
    D = defaultdict(list)
    for i,item in enumerate(data_samples_ready):
        D[item].append(i)
    D = {k:v for k,v in D.items() if len(v)>1}
    
    # https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-a-list-of-lists
    L = list(D.values())
    flat_list_dups = [item for sublist in L for item in sublist]

    # https://stackoverflow.com/questions/11303225/how-to-remove-multiple-indexes-from-a-list-at-the-same-time
    for index in sorted(flat_list_dups, reverse=True):
        del data_samples_ready[index]
        del data_samples_clean[index]
        del data_samples[index]
    
    # Remove blanks
    data_samples_ready = [i for i in data_samples_ready if i]
    data_samples_clean = [i for i in data_samples_clean if i]
    data_samples = [i for i in data_samples if i]
    
    return data_samples_ready, data_samples_clean, flat_list_dups, data_samples