karthik1362
committed on
Commit
•
c60e255
1
Parent(s):
c6639dd
Upload 2 files
Browse files- pdf_chat.ipynb +0 -0
- pdf_chat.py +58 -0
pdf_chat.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
pdf_chat.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""pdf chat.ipynb
|
3 |
+
|
4 |
+
Automatically generated by Colaboratory.
|
5 |
+
|
6 |
+
Original file is located at
|
7 |
+
https://colab.research.google.com/drive/1RXTs4FPcFCVb9_ZAWBBxLoYQEcKz37x9
|
8 |
+
"""
|
9 |
+
|
10 |
+
!pip install langchain
|
11 |
+
!pip install unstructured # The unstructured library provides open-source components for pre-processing text documents such as PDFs, HTML and Word Documents.
|
12 |
+
!pip install openai
|
13 |
+
!pip install pybind11 # pybind11 is a lightweight header-only library that exposes C++ types in Python
|
14 |
+
!pip install chromadb # the AI-native open-source embedding database
|
15 |
+
!pip install Cython # Cython is an optimising static compiler for both the Python programming language
|
16 |
+
!pip3 install "git+https://github.com/philferriere/cocoapi.git#egg=pycocotools&subdirectory=PythonAPI" # COCO is a large image dataset designed for object detection, segmentation, person keypoints detection, stuff segmentation, and caption generation
|
17 |
+
!pip install unstructured[local-inference]
|
18 |
+
!CC=clang CXX=clang++ ARCHFLAGS="-arch x86_64" pip install 'git+https://github.com/facebookresearch/detectron2.git' # Detectron2 is Facebook AI Research's next generation library that provides state-of-the-art detection and segmentation algorithms.
|
19 |
+
!pip install layoutparser[layoutmodels,tesseract] # A Unified Toolkit for Deep Learning Based Document Image Analysis
|
20 |
+
!pip install pytesseract # Python-tesseract is an optical character recognition (OCR) tool for python.
|
21 |
+
!pip install Pillow==9.0.0 # The Python Imaging Library adds image processing capabilities to your Python interpreter. Need this version, otherwise errors occur.
|
22 |
+
!pip install tiktoken
|
23 |
+
!pip install --upgrade Pillow # NOTE(review): this upgrade overrides the Pillow==9.0.0 pin installed earlier in this script - confirm which version is actually required
|
24 |
+
|
25 |
+
import os
|
26 |
+
# Never hard-code API keys in source: anything committed to a repository must be
# treated as leaked and revoked. Use the key already present in the environment,
# or prompt for it interactively (input is not echoed or stored in the notebook).
if not os.environ.get('OPENAI_API_KEY'):
    from getpass import getpass
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')
|
27 |
+
|
28 |
+
from langchain.document_loaders import UnstructuredPDFLoader
|
29 |
+
from langchain.indexes import VectorstoreIndexCreator
|
30 |
+
|
31 |
+
from detectron2.config import get_cfg
|
32 |
+
cfg = get_cfg()
|
33 |
+
cfg.MODEL.DEVICE = 'cuda' # GPU is recommended; the device string must be a valid torch device ('cuda' or 'cpu') - 'gpu' is not accepted
|
34 |
+
|
35 |
+
!wget https://pgcag.files.wordpress.com/2010/01/48lawsofpower.pdf # sample PDF (48 Laws of Power); replace with any pdf
|
36 |
+
|
37 |
+
!mkdir docs
|
38 |
+
!mv 48lawsofpower.pdf docs
|
39 |
+
|
40 |
+
text_folder = 'docs'
|
41 |
+
# Build one loader per PDF in the folder. Non-PDF entries (e.g. Jupyter's
# .ipynb_checkpoints directory) are skipped because UnstructuredPDFLoader cannot
# parse them, and the listing is sorted so the load order is deterministic.
loaders = [
    UnstructuredPDFLoader(os.path.join(text_folder, fn))
    for fn in sorted(os.listdir(text_folder))
    if fn.lower().endswith('.pdf')
]
|
42 |
+
|
43 |
+
!apt-get install poppler-utils # error occurs without this, pdf rendering library
|
44 |
+
|
45 |
+
index = VectorstoreIndexCreator().from_loaders(loaders)
|
46 |
+
|
47 |
+
query = "Can you give me an example from history where the enemy was crushed totally from the book?"
|
48 |
+
index.query(query)
|
49 |
+
|
50 |
+
query = "What's the point of making myself less accessible?"
|
51 |
+
index.query(query)
|
52 |
+
|
53 |
+
query = "Can you tell me the story of Queen Elizabeth I from this 48 laws of power book?"
|
54 |
+
index.query(query)
|
55 |
+
|
56 |
+
query = "State the names of 5 laws?"
|
57 |
+
index.query(query)
|
58 |
+
|