Spaces:

IronOne-AI-Labs
/

Annual_Report_Summarization_Demo

Sleeping

App Files Files Community

RMWeerasinghe committed on May 22, 2024

Commit

99e744f

1 Parent(s): 0528be1

Initial Commit

Browse files

Files changed (10) hide show

.gitignore +6 -1
app.py +255 -68
config.py +5 -0
mapReduceSummarizer.py +50 -0
model.py +43 -0
preprocess.py +33 -0
refineSummarizer.py +41 -0
requirements.txt +0 -0
summarizer.py +72 -0
utils.py +6 -7

.gitignore CHANGED Viewed

@@ -25,6 +25,7 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
 # PyInstaller
 #  Usually these files are written by a python script from a template
@@ -142,4 +143,8 @@ Docs/
 .DS_Store
 .vscode/
 test.ipynb
-test.py

 .installed.cfg
 *.egg
 MANIFEST
+.conda
 # PyInstaller
 #  Usually these files are written by a python script from a template
 .DS_Store
 .vscode/
 test.ipynb
+test.py
+requirements1.txt
+#logs
+logs/

app.py CHANGED Viewed

@@ -1,10 +1,13 @@
 import nltk
 import validators
 import streamlit as st
-from transformers import AutoTokenizer, pipeline
-# local modules
-from extractive_summarizer.model_processors import Summarizer
 from utils import (
     clean_text,
     fetch_article_text,
@@ -12,20 +15,69 @@ from utils import (
     read_text_from_file,
 )
 from rouge import Rouge
-if __name__ == "__main__":
-    # ---------------------------------
-    # Main Application
-    # ---------------------------------
     st.title("Text Summarizer 📝")
-    st.markdown("Creator: [Atharva Ingle](https://github.com/Gladiator07)")
-    st.markdown(
-        "Source code: [GitHub Repository](https://github.com/Gladiator07/Text-Summarizer)"
     )
-    summarize_type = st.sidebar.selectbox(
-        "Summarization type", options=["Extractive", "Abstractive"]
     )
     st.markdown(
@@ -44,15 +96,8 @@ if __name__ == "__main__":
     )
     st.markdown("---")
     # ---------------------------
-    # SETUP & Constants
-    nltk.download("punkt")
-    abs_tokenizer_name = "facebook/bart-large-cnn"
-    abs_model_name = "facebook/bart-large-cnn"
-    abs_tokenizer = AutoTokenizer.from_pretrained(abs_tokenizer_name)
-    abs_max_length = 90
-    abs_min_length = 30
-    # ---------------------------
     inp_text = st.text_input("Enter text or a url here")
     st.markdown(
         "<h3 style='text-align: center; color: green;'>OR</h3>",
@@ -65,11 +110,14 @@ if __name__ == "__main__":
     is_url = validators.url(inp_text)
     if is_url:
         # complete text, chunks to summarize (list of sentences for long docs)
         text, cleaned_txt = fetch_article_text(url=inp_text)
     elif uploaded_file:
         cleaned_txt = read_text_from_file(uploaded_file)
         cleaned_txt = clean_text(cleaned_txt)
     else:
         cleaned_txt = clean_text(inp_text)
     # view summarized text (expander)
@@ -80,51 +128,190 @@ if __name__ == "__main__":
             st.write(cleaned_txt)
     summarize = st.button("Summarize")
-    # called on toggle button [summarize]
-    if summarize:
-        if summarize_type == "Extractive":
-            if is_url:
-                text_to_summarize = " ".join([txt for txt in cleaned_txt])
-            else:
-                text_to_summarize = cleaned_txt
-            # extractive summarizer
-            with st.spinner(
-                text="Creating extractive summary. This might take a few seconds ..."
-            ):
-                ext_model = Summarizer()
-                summarized_text = ext_model(text_to_summarize, num_sentences=5)
-        elif summarize_type == "Abstractive":
-            with st.spinner(
-                text="Creating abstractive summary. This might take a few seconds ..."
-            ):
-                text_to_summarize = cleaned_txt
-                abs_summarizer = pipeline(
-                    "summarization", model=abs_model_name, tokenizer=abs_tokenizer_name
-                )
-                if is_url is False:
-                    # list of chunks
-                    text_to_summarize = preprocess_text_for_abstractive_summarization(
-                        tokenizer=abs_tokenizer, text=cleaned_txt
-                    )
-                tmp_sum = abs_summarizer(
-                    text_to_summarize,
-                    max_length=abs_max_length,
-                    min_length=abs_min_length,
-                    do_sample=False,
-                )
-                summarized_text = " ".join([summ["summary_text"] for summ in tmp_sum])
-        # final summarized output
-        st.subheader("Summarized text")
-        st.info(summarized_text)
-        st.subheader("Rogue Scores")
-        rouge_sc = Rouge()
-        ground_truth = cleaned_txt[0] if is_url else cleaned_txt
-        score = rouge_sc.get_scores(summarized_text, ground_truth, avg=True)
-        st.code(score)

+import datetime
+import logging
 import nltk
 import validators
 import streamlit as st
+from summarizer import Summarizer
+from config import MODELS
+from warnings import filterwarnings
+filterwarnings("ignore")
 from utils import (
     clean_text,
     fetch_article_text,
     read_text_from_file,
 )
 from rouge import Rouge
+def filer():
+    """Return the path of today's log file, creating its directory if needed.
+
+    The name has the form ``logs/YYYY-MM-DD.log`` and is built from the current
+    local date. The result is handed straight to ``logging.FileHandler``, which
+    opens the file immediately and fails when the parent directory is missing,
+    so the directory is created here before the path is returned.
+    """
+    import os  # local import keeps this fix self-contained
+    today = datetime.datetime.today()
+    log_filename = f"logs/{today.year}-{today.month:02d}-{today.day:02d}.log"
+    os.makedirs(os.path.dirname(log_filename), exist_ok=True)
+    return log_filename
+# Send log records to the per-day file whose path is produced by filer().
+file_handler = logging.FileHandler(filer())
+# file_handler = logging.handlers.TimedRotatingFileHandler(filer(),when="D")
+# The handler only accepts INFO and above, although the root level below is DEBUG.
+file_handler.setLevel(logging.INFO)
+logging.basicConfig(
+    level=logging.DEBUG,
+    format="%(asctime)s %(levelname)s (%(name)s) : %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    handlers=[file_handler],
+    # force=True removes handlers already attached to the root logger first.
+    force=True,
+)
+logger = logging.getLogger(__name__)
+# Seed the session-state slot that the sidebar key input is bound to.
+# Note the default is a single space, not an empty string.
+if "api_key" not in st.session_state:
+    st.session_state.api_key = " "
+@st.cache_resource
+def initialize_app():
+    """Download the NLTK ``punkt`` data; decorated with ``st.cache_resource``."""
+    nltk.download("punkt")
+@st.cache_resource
+def init_summarizer(model_name,api_key=None):
+    """Build the ``Summarizer`` for the model selected in the sidebar.
+
+    ``model_name`` must be a key of ``MODELS``. The entry ``"OpenAI"`` selects
+    the ``openai`` backend and forwards ``api_key``; every other entry is
+    treated as a ``local`` model and ``api_key`` is not passed on.
+    """
+    backend = "openai" if model_name == "OpenAI" else "local"
+    checkpoint = MODELS[model_name]
+    if backend != "openai":
+        logger.info(f"Model for summarization : {checkpoint}")
+        return Summarizer(checkpoint, backend)
+    # No key validation happens here; the key is handed over as received.
+    return Summarizer(checkpoint, backend, api_key)
+def load_app():
     st.title("Text Summarizer 📝")
+    # st.markdown("Creator: [Atharva Ingle](https://github.com/Gladiator07)")
+    # st.markdown(
+    #     "Source code: [GitHub Repository](https://github.com/Gladiator07/Text-Summarizer)"
+    # )
+    model_name = st.sidebar.selectbox(
+        "Model Name", options=["Version 0", "Version 1","OpenAI"]
     )
+    if model_name == "OpenAI":
+        st.sidebar.text_input("Enter a valid OpenAI API Key",key = "api_key" ,type="password")
+    summarizer_type = st.sidebar.selectbox(
+        "Summarizer Type for Long Text", options=["Map Reduce", "Refine"]
     )
     st.markdown(
     )
     st.markdown("---")
     # ---------------------------
+    # ---------------------------
     inp_text = st.text_input("Enter text or a url here")
     st.markdown(
         "<h3 style='text-align: center; color: green;'>OR</h3>",
     is_url = validators.url(inp_text)
     if is_url:
         # complete text, chunks to summarize (list of sentences for long docs)
+        logger.info("Text Input Type: URL")
         text, cleaned_txt = fetch_article_text(url=inp_text)
     elif uploaded_file:
+        logger.info("Text Input Type: FILE")
         cleaned_txt = read_text_from_file(uploaded_file)
         cleaned_txt = clean_text(cleaned_txt)
     else:
+        logger.info("Text Input Type: INPUT TEXT")
         cleaned_txt = clean_text(inp_text)
     # view summarized text (expander)
             st.write(cleaned_txt)
     summarize = st.button("Summarize")
+    if is_url:
+        text_to_summarize = " ".join([txt for txt in cleaned_txt])
+    else:
+        text_to_summarize = cleaned_txt
+    return text_to_summarize, model_name, summarizer_type, summarize
+def get_summary(text_to_summarize,model_name, summarizer_type, summarize):
+    """Summarize ``text_to_summarize`` once the "Summarize" button was pressed.
+
+    Args:
+        text_to_summarize: Text handed to ``Summarizer.summarize``.
+        model_name: Key understood by ``init_summarizer``.
+        summarizer_type: ``"Refine"`` selects the refine strategy; any other
+            value selects map-reduce.
+        summarize: Truthy when the button was pressed in this script run.
+
+    Returns:
+        The ``(summary, seconds)`` pair returned by ``Summarizer.summarize``.
+    """
+    if not summarize:
+        # The button value cannot change during a single script run, so
+        # looping until it becomes truthy would spin forever. End this run
+        # instead and let the next interaction start a fresh one.
+        st.stop()
+    logger.info(f"Model Name: {model_name}")
+    logger.info(f"Summarization Type for Long Text: {summarizer_type}")
+    api_key = st.session_state.api_key
+    summarizer = init_summarizer(model_name,api_key)
+    strategy = "refine" if summarizer_type == "Refine" else "map_reduce"
+    with st.spinner(
+        text="Creating summary. This might take a few seconds ..."
+    ):
+        summarized_text, elapsed = summarizer.summarize(text_to_summarize,strategy)
+    return summarized_text, elapsed
+def display_output(summarized_text,time):
+    """Log the summary and its duration, then render both on the page.
+
+    ``time`` is the elapsed value shown with an ``s`` suffix; it is a plain
+    parameter here, not the ``time`` module.
+    """
+    logger.info(f"SUMMARY: {summarized_text}")
+    logger.info(f"Summary took {time}s")
+    st.subheader("Summarized text")
+    st.info(f"{summarized_text}")
+    st.info(f"Time: {time}s")
+# def summarizer_app():
+#     # ---------------------------------
+#     # Main Application
+#     # ---------------------------------
+#     st.title("Text Summarizer 📝")
+#     # st.markdown("Creator: [Atharva Ingle](https://github.com/Gladiator07)")
+#     # st.markdown(
+#     #     "Source code: [GitHub Repository](https://github.com/Gladiator07/Text-Summarizer)"
+#     # )
+#     model_name = st.sidebar.selectbox(
+#         "Model Name", options=["Version 0", "Version 1","OpenAI"]
+#     )
+#     if model_name == "OpenAI":
+#         st.sidebar.text_input("Enter a valid OpenAI API Key",key = "api_key" ,type="password")
+#     summarizer_type = st.sidebar.selectbox(
+#         "Summarizer Type for Long Text", options=["Map Reduce", "Refine"]
+#     )
+#     st.markdown(
+#         "Enter a text or a url to get a concise summary of the article while conserving the overall meaning. This app supports text in the following formats:"
+#     )
+#     st.markdown(
+#         """- Raw text in text box
+# - URL of article/news to be summarized
+# - .txt, .pdf, .docx file formats"""
+#     )
+#     st.markdown(
+#         """This app supports two type of summarization:
+# 1. **Extractive Summarization**: The extractive approach involves picking up the most important phrases and lines from the documents. It then combines all the important lines to create the summary. So, in this case, every line and word of the summary actually belongs to the original document which is summarized.
+# 2. **Abstractive Summarization**: The abstractive approach involves rephrasing the complete document while capturing the complete meaning of the document. This type of summarization provides more human-like summary"""
+#     )
+#     st.markdown("---")
+#     # ---------------------------
+#     # SETUP & Constants
+#     # nltk.download("punkt")
+#     # abs_tokenizer_name = "facebook/bart-large-cnn"
+#     # abs_model_name = "facebook/bart-large-cnn"
+#     # abs_tokenizer = AutoTokenizer.from_pretrained(abs_tokenizer_name)
+#     # abs_max_length = 90
+#     # abs_min_length = 30
+#     # model_name_v0 = "IronOne-AI-Labs/long-t5-tglobal-16k-annual-reports-v0"
+#     # model_name_v1 = "IronOne-AI-Labs/long-t5-tglobal-16k-annual-reports-v1"
+#     # ---------------------------
+#     inp_text = st.text_input("Enter text or a url here")
+#     st.markdown(
+#         "<h3 style='text-align: center; color: green;'>OR</h3>",
+#         unsafe_allow_html=True,
+#     )
+#     uploaded_file = st.file_uploader(
+#         "Upload a .txt, .pdf, .docx file for summarization"
+#     )
+#     is_url = validators.url(inp_text)
+#     if is_url:
+#         # complete text, chunks to summarize (list of sentences for long docs)
+#         logger.info("Text Input Type: URL")
+#         text, cleaned_txt = fetch_article_text(url=inp_text)
+#     elif uploaded_file:
+#         logger.info("Text Input Type: FILE")
+#         cleaned_txt = read_text_from_file(uploaded_file)
+#         cleaned_txt = clean_text(cleaned_txt)
+#     else:
+#         logger.info("Text Input Type: INPUT TEXT")
+#         cleaned_txt = clean_text(inp_text)
+#     # view summarized text (expander)
+#     with st.expander("View input text"):
+#         if is_url:
+#             st.write(cleaned_txt[0])
+#         else:
+#             st.write(cleaned_txt)
+#     summarize = st.button("Summarize")
+#     # called on toggle button [summarize]
+#     if summarize:
+#         if is_url:
+#             text_to_summarize = " ".join([txt for txt in cleaned_txt])
+#         else:
+#             text_to_summarize = cleaned_txt
+#         logger.info(f"Model Name: {model_name}")
+#         logger.info(f"Summarization Type for Long Text: {summarizer_type}")
+#         api_key = st.session_state.api_key
+#         print(api_key)
+#         summarizer = init_summarizer(model_name,api_key)
+#         with st.spinner(
+#             text="Creating summary. This might take a few seconds ..."
+#         ):
+#                 #ext_model = Summarizer()
+#                 #summarized_text = ext_model(text_to_summarize, num_sentences=5)
+#             if summarizer_type == "Refine":
+#                 summarized_text, time = summarizer.summarize(text_to_summarize,"refine")
+#             else :
+#                 summarized_text, time = summarizer.summarize(text_to_summarize,"map_reduce")
+#         # elif model_name == "Version 1":
+#         #     with st.spinner(
+#         #         text="Creating summary. This might take a few seconds ..."
+#         #     ):
+#         #         if summarizer_type == "Refine":
+#         #             summarized_text, time = summarizer_v1.summarize(text_to_summarize,"refine")
+#         #         else :
+#         #             summarized_text, time = summarizer_v1.summarize(text_to_summarize,"map_reduce")
+#         # final summarized output
+#         logger.info(f"SUMMARY: {summarized_text}")
+#         logger.info(f"Summary took {time}s")
+#         st.subheader("Summarized text")
+#         st.info(f"{summarized_text}")
+#         st.info(f"Time: {time}s")
+#         # st.subheader("Rogue Scores")
+#         # rouge_sc = Rouge()
+#         # ground_truth = cleaned_txt[0] if is_url else cleaned_txt
+#         # score = rouge_sc.get_scores(summarized_text, ground_truth, avg=True)
+#         # st.code(score)
+if __name__ == "__main__":
+    # Pipeline for one script run: setup, render inputs, summarize, show result.
+    initialize_app()
+    text_to_summarize, model_name, summarizer_type, summarize = load_app()
+    # get_summary returns a (summary, elapsed) pair that is unpacked here.
+    summarized_text,time = get_summary(text_to_summarize, model_name, summarizer_type, summarize)
+    display_output(summarized_text,time)

config.py ADDED Viewed

	@@ -0,0 +1,5 @@

+# Maps a model display name to a Hugging Face model id.
+MODELS = {
+    "Version 0":"IronOne-AI-Labs/long-t5-tglobal-16k-annual-reports-v0",
+    "Version 1":"IronOne-AI-Labs/long-t5-tglobal-16k-annual-reports-v1",
+    # The "OpenAI" entry reuses the v1 id; per the original note it is only
+    # needed so a tokenizer can be loaded for that backend.
+    "OpenAI" : "IronOne-AI-Labs/long-t5-tglobal-16k-annual-reports-v1" #for tokenizer
+}

mapReduceSummarizer.py ADDED Viewed

	@@ -0,0 +1,50 @@

+from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
+from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain, LLMChain, StuffDocumentsChain
+from langchain.prompts import PromptTemplate
+def get_map_reduce_chain(pipeline_or_llm,model_type)-> MapReduceDocumentsChain:
+    """Build a map-reduce summarization chain for the given backend.
+
+    Args:
+        pipeline_or_llm: For ``model_type == "openai"`` an LLM object used as
+            is; otherwise a pipeline that is wrapped in ``HuggingFacePipeline``.
+        model_type: ``"openai"`` selects the instruction-style prompts; any
+            other value uses pass-through ``{docs}`` prompts.
+
+    Returns:
+        The assembled ``MapReduceDocumentsChain`` (the previous ``LLMChain``
+        annotation did not match the returned object).
+    """
+    if model_type == "openai":
+        llm = pipeline_or_llm
+        map_template = """The following is a set of documents
+            {docs}
+            Based on this list of docs, please identify the main themes.
+            Helpful Answer:"""
+        map_prompt = PromptTemplate.from_template(map_template)
+        reduce_template = """The following is set of summaries:
+        {docs}
+        Take these and distill into a final, consolidated summary of the main themes.
+        Helpful Answer:"""
+        reduce_prompt = PromptTemplate.from_template(reduce_template)
+    else:
+        # The prompt is just the document text, with no instructions added.
+        map_prompt = PromptTemplate.from_template(template="{docs}")
+        reduce_prompt = PromptTemplate.from_template(template="{docs}")
+        llm = HuggingFacePipeline(pipeline=pipeline_or_llm)
+    map_chain = LLMChain(llm = llm, prompt=map_prompt)
+    reduce_chain = LLMChain(llm = llm, prompt = reduce_prompt,verbose = True)
+    combine_documents_chain = StuffDocumentsChain(llm_chain=reduce_chain, document_variable_name="docs")
+    # The same stuff-chain is used both to collapse and to combine.
+    reduce_documents_chain = ReduceDocumentsChain(
+        combine_documents_chain=combine_documents_chain,
+        collapse_documents_chain=combine_documents_chain,
+        token_max=16384,
+        verbose = True,
+    )
+    map_reduce_chain = MapReduceDocumentsChain(
+        llm_chain=map_chain,
+        reduce_documents_chain=reduce_documents_chain,
+        document_variable_name="docs",
+        return_intermediate_steps=False,
+        verbose = True,
+    )
+    return map_reduce_chain

model.py ADDED Viewed

	@@ -0,0 +1,43 @@

+import os
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
+from langchain_openai import OpenAI
+from huggingface_hub import login
+from dotenv import load_dotenv
+from logging import getLogger
+import streamlit as st
+import torch
+# Earlier variant that read the token from the environment (kept disabled):
+# load_dotenv()
+# hf_token = os.environ.get("HF_TOKEN")
+# The Hugging Face token is read from Streamlit secrets; login runs at import.
+hf_token = st.secrets["HF_TOKEN"]
+login(token=hf_token)
+logger = getLogger(__name__)
+# Use CUDA when torch reports it is available, otherwise fall back to CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+def get_local_model(model_name_or_path:str)->pipeline:
+    """Load a seq2seq model and tokenizer and wrap them in a summarization pipeline.
+
+    The pipeline is created on the module-level ``device``.
+    NOTE(review): the return annotation names ``pipeline``, the factory function
+    imported from ``transformers``, not a class; confirm the intended type.
+    """
+    #print(f"Model is running on {device}")
+    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
+    pipe = pipeline(
+        task = 'summarization',
+        model=model,
+        tokenizer=tokenizer,
+        device = device,
+    )
+    logger.info(f"Summarization pipeline created and loaded to {device}")
+    return pipe
+def get_endpoint(api_key:str):
+    """Return an ``OpenAI`` LLM object configured with ``api_key``."""
+    return OpenAI(openai_api_key=api_key)
+def get_model(model_type,model_name_or_path,api_key = None):
+    """Return the backend for ``model_type``.
+
+    ``"openai"`` yields the object from ``get_endpoint(api_key)``; every other
+    value yields the local pipeline for ``model_name_or_path``.
+    """
+    if model_type != "openai":
+        return get_local_model(model_name_or_path)
+    return get_endpoint(api_key)

preprocess.py ADDED Viewed

	@@ -0,0 +1,33 @@

+from langchain.docstore.document import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from logging import getLogger
+logger = getLogger(__name__)
+def get_input_token_count(text:str,tokenizer)->int:
+    """Return how many tokens ``tokenizer.tokenize`` produces for ``text``."""
+    return len(tokenizer.tokenize(text))
+def get_document_splits_from_text(text:str) -> list:
+    """Split ``text`` into a list of ``Document`` chunks.
+
+    The text is wrapped in a single ``Document`` and split with a
+    ``RecursiveCharacterTextSplitter`` (``chunk_size=15000``,
+    ``chunk_overlap=50``). The result of ``split_documents`` is returned, so
+    the return annotation is a list rather than a single ``Document``.
+    """
+    document = Document(page_content=text)
+    text_splitter = RecursiveCharacterTextSplitter(
+        separators=["\n\n","\n",".","?"," "],
+        chunk_size=15000,
+        chunk_overlap = 50
+    )
+    split_documents = text_splitter.split_documents([document])
+    logger.info(f"Splitting Document: Total Chunks: {len(split_documents)} ")
+    return split_documents
+def prepare_for_summarize(text:str,tokenizer):
+    """Classify ``text`` by token count and prepare it for summarization.
+
+    Returns ``(text, "short")`` when the token count is below 12000, otherwise
+    ``(document_splits, "long")`` with the text split into chunks.
+    """
+    if get_input_token_count(text,tokenizer) < 12000:
+        return text, "short"
+    return get_document_splits_from_text(text), "long"

refineSummarizer.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
+from langchain.chains.summarize import load_summarize_chain
+from langchain.prompts import PromptTemplate
+def get_refine_chain(pipeline_or_llm, model_type):
+    """Build a "refine" summarization chain for the given backend.
+
+    For ``model_type == "openai"`` the argument is used directly as the LLM and
+    instruction-style prompts are used; otherwise the argument is wrapped in
+    ``HuggingFacePipeline`` and the prompts only pass the text through.
+    Returns the chain created by ``load_summarize_chain``.
+    """
+    if model_type == "openai":
+        llm = pipeline_or_llm
+        # Prompt for the first chunk.
+        question_template  = """Write a concise summary of the following:
+                        {text}
+                        CONCISE SUMMARY:"""
+        question_prompt = PromptTemplate.from_template(question_template)
+        # Prompt for later chunks: existing summary plus new text.
+        refine_template =  """Your job is to produce a final summary
+                We have provided an existing summary up to a certain point: {existing_answer}
+                We have the opportunity to refine the existing summary (only if needed) with some more context below.
+                ------------
+                {text}
+                ------------
+                Given the new context, refine the original summary in bullets. If the context isn't useful return the original summary."""
+        refine_prompt = PromptTemplate.from_template(refine_template)
+    else:
+        # No instructions: the prompt is the raw text, or summary + text.
+        question_prompt = PromptTemplate.from_template(template="{text}")
+        refine_prompt = PromptTemplate.from_template(template= "{existing_answer}\n{text}")
+        llm = HuggingFacePipeline(pipeline=pipeline_or_llm)
+    refine_chain = load_summarize_chain(
+        llm=llm,
+        chain_type="refine",
+        question_prompt=question_prompt,
+        refine_prompt=refine_prompt,
+        return_intermediate_steps=False,
+        input_key="input_documents",
+        output_key="output_text",
+        verbose=True,
+    )
+    return refine_chain

requirements.txt CHANGED Viewed

Binary files a/requirements.txt and b/requirements.txt differ

summarizer.py ADDED Viewed

	@@ -0,0 +1,72 @@

+from model import get_model
+from mapReduceSummarizer import get_map_reduce_chain
+from refineSummarizer import get_refine_chain
+from preprocess import prepare_for_summarize
+from transformers import AutoTokenizer
+from langchain.prompts import PromptTemplate
+from logging import getLogger
+import time
+logger = getLogger(__name__)
+class Summarizer:
+    """Summarize text with a local pipeline or an OpenAI LLM backend."""
+    def __init__(self,model_name,model_type,api_key=None) -> None:
+        """Load the tokenizer for ``model_name`` and the backend for ``model_type``."""
+        self.model_type = model_type
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.base_summarizer = get_model(model_type,model_name,api_key)
+    @staticmethod
+    def _run_timed(run):
+        """Call ``run()`` and return ``(result, seconds)``, seconds rounded to 2 places."""
+        start = time.time()
+        summary = run()
+        elapsed = round((time.time()-start),2)
+        print(f"Summary generation took {elapsed}s.")
+        return summary, elapsed
+    def summarize(self,text:str,summarizer_type = "map_reduce")->tuple:
+        """Summarize ``text`` and report how long generation took.
+
+        Short inputs (as classified by ``prepare_for_summarize``) go to the
+        backend in one call. Long inputs are split and run through a refine
+        chain when ``summarizer_type == "refine"``, otherwise map-reduce.
+
+        Returns:
+            ``(summary, seconds)``. The former ``str`` annotation did not
+            match this pair, which callers already unpack.
+
+        Raises:
+            ValueError: for a short input when ``model_type`` is neither
+                ``"openai"`` nor ``"local"`` (previously ``None`` was
+                returned silently).
+        """
+        text_to_summarize,length_type = prepare_for_summarize(text,self.tokenizer)
+        if length_type =="short":
+            logger.info("Processing Input Text less than 12000 Tokens")
+            if self.model_type=="openai":
+                llm = self.base_summarizer
+                prompt = PromptTemplate.from_template(
+                    template="""Write a concise and complete summary in bullet points of the given annual report.
+                        Important:
+                        * Note that the summary should contain all important information and it should not contain any unwanted information.
+                        * Make sure to keep the summary as short as possible. And Summary should be in bullet points. Seperate each point with a new line.
+                        TEXT: {text}
+                        SUMMARY:"""
+                )
+                llm_chain = prompt|llm
+                return self._run_timed(lambda: llm_chain.invoke({"text": text_to_summarize}))
+            if self.model_type == "local":
+                pipe = self.base_summarizer
+                return self._run_timed(lambda: pipe(text_to_summarize)[0]['summary_text'])
+            raise ValueError(f"Unsupported model_type: {self.model_type!r}")
+        # Long input: only the chain invocation is timed, not its construction.
+        if summarizer_type == "refine":
+            print("The text is too long, Running Refine Summarizer")
+            llm_chain = get_refine_chain(self.base_summarizer,self.model_type)
+            logger.info("Running Refine Chain for Summarization")
+        else:
+            print("The text is too long, Running Map Reduce Summarizer")
+            llm_chain = get_map_reduce_chain(self.base_summarizer,model_type=self.model_type)
+            logger.info("Running Map Reduce Chain for Summarization")
+        return self._run_timed(
+            lambda: llm_chain.invoke({"input_documents": text_to_summarize}, return_only_outputs=True)['output_text']
+        )

utils.py CHANGED Viewed

@@ -2,7 +2,7 @@ import re
 import requests
 import docx2txt
 from io import StringIO
-from PyPDF2 import PdfFileReader
 from bs4 import BeautifulSoup
 from nltk.tokenize import sent_tokenize
@@ -31,7 +31,8 @@ def clean_text(x):
     # x = re.sub(r"\w*\d+\w*", "", x)  # numbers
     x = re.sub(r"\s{2,}", " ", x)  # over spaces
     x = emoji_pattern.sub(r"", x)  # emojis
-    x = re.sub("[^.,!?A-Za-z0-9]+", " ", x)  # special charachters except .,!?
     return x
@@ -103,12 +104,10 @@ def preprocess_text_for_abstractive_summarization(tokenizer, text):
 def read_pdf(file):
-    pdfReader = PdfFileReader(file)
-    count = pdfReader.numPages
     all_page_text = ""
-    for i in range(count):
-        page = pdfReader.getPage(i)
-        all_page_text += page.extractText()
     return all_page_text

 import requests
 import docx2txt
 from io import StringIO
+from PyPDF2 import PdfReader
 from bs4 import BeautifulSoup
 from nltk.tokenize import sent_tokenize
     # x = re.sub(r"\w*\d+\w*", "", x)  # numbers
     x = re.sub(r"\s{2,}", " ", x)  # over spaces
     x = emoji_pattern.sub(r"", x)  # emojis
+    x = x.replace("$","Dollars ")
+    x = re.sub("[^.,!?%A-Za-z0-9]+", " ", x)  # special characters except .,!?%
     return x
 def read_pdf(file):
+    """Return the text of every page of the PDF ``file``, concatenated in page order."""
+    pdfReader = PdfReader(file)
+    all_page_text = "".join(page.extract_text() for page in pdfReader.pages)
     return all_page_text