karthik1362
/

48-Laws-of-Power

Model card Files Files and versions Community

48-Laws-of-Power / pdf_chat.py

karthik1362's picture

Upload 2 files

c60e255 verified 11 months ago

history blame contribute delete

2.74 kB

	# -*- coding: utf-8 -*-
	"""pdf chat.ipynb

	Automatically generated by Colaboratory.

	Original file is located at
	https://colab.research.google.com/drive/1RXTs4FPcFCVb9_ZAWBBxLoYQEcKz37x9
	"""

	# Notebook setup: every line starting with `!` is an IPython shell escape that
	# installs one dependency into the running notebook environment. These lines
	# are not valid plain-Python syntax, so this file must be run under
	# IPython/Colab rather than with the bare `python` interpreter.
	!pip install langchain
	!pip install unstructured # The unstructured library provides open-source components for pre-processing text documents such as PDFs, HTML and Word Documents.
	!pip install openai
	!pip install pybind11 # pybind11 is a lightweight header-only library that exposes C++ types in Python
	!pip install chromadb # the AI-native open-source embedding database
	!pip install Cython # Cython is an optimising static compiler for both the Python programming language
	!pip3 install "git+https://github.com/philferriere/cocoapi.git#egg=pycocotools&subdirectory=PythonAPI" # COCO is a large image dataset designed for object detection, segmentation, person keypoints detection, stuff segmentation, and caption generation
	# NOTE(review): the two extras specifiers below (`unstructured[...]` and
	# `layoutparser[...]`) are unquoted; some shells treat `[...]` as a glob
	# pattern — consider quoting them.
	!pip install unstructured[local-inference]
	# Builds detectron2 from source with clang, forcing an x86_64 target.
	!CC=clang CXX=clang++ ARCHFLAGS="-arch x86_64" pip install 'git+https://github.com/facebookresearch/detectron2.git' # Detectron2 is Facebook AI Research's next generation library that provides state-of-the-art detection and segmentation algorithms.
	!pip install layoutparser[layoutmodels,tesseract] # A Unified Toolkit for Deep Learning Based Document Image Analysis
	!pip install pytesseract # Python-tesseract is an optical character recognition (OCR) tool for python.
	# NOTE(review): Pillow is pinned to 9.0.0 on the next line, but the final
	# `pip install --upgrade Pillow` replaces that pin with the newest release,
	# so the "need this version" remark no longer holds — confirm which version
	# is actually required and drop the other line.
	!pip install Pillow==9.0.0 # The Python Imaging Library adds image processing capabilities to your Python interpreter. Need this version, otherwise errors occur.
	!pip install tiktoken
	!pip install --upgrade Pillow

	import os
	from getpass import getpass

	# SECURITY: never commit a literal API key. A key used to be hard-coded on
	# this line; treat that key as leaked and revoke it with the provider.
	# The key is now read from the OPENAI_API_KEY environment variable and is
	# only requested interactively (without echoing) when the variable is
	# missing or empty.
	if not os.environ.get('OPENAI_API_KEY'):
	    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

	from langchain.document_loaders import UnstructuredPDFLoader
	from langchain.indexes import VectorstoreIndexCreator

	from detectron2.config import get_cfg
	import torch

	# Start from detectron2's default configuration node.
	# NOTE(review): `cfg` is not referenced again in the visible part of this
	# script, so this setting may have no effect on the PDF loader — confirm.
	cfg = get_cfg()
	# The device must be a string PyTorch understands: the GPU backend is named
	# 'cuda' ('gpu' is rejected by torch.device). Fall back to 'cpu' when no CUDA
	# device is present so the notebook still runs on a CPU-only runtime.
	cfg.MODEL.DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'  # GPU is recommended

	!wget https://pgcag.files.wordpress.com/2010/01/48lawsofpower.pdf #meta earnings; replace with any pdf

	!mkdir docs
	!mv 48lawsofpower.pdf docs

	text_folder = 'docs'
	loaders = [UnstructuredPDFLoader(os.path.join(text_folder, fn)) for fn in os.listdir(text_folder)]

	!apt-get install poppler-utils # error occurs without this, pdf rendering library

	# Build a vector-store index over every document produced by `loaders`.
	# NOTE(review): presumably this embeds the text through the OpenAI API using
	# the OPENAI_API_KEY environment variable — confirm against the langchain
	# version in use.
	index = VectorstoreIndexCreator().from_loaders(loaders)

	# Example questions to ask about the indexed book.
	questions = [
	    "Can you give me an example from history where the enemy was crushed totally from the book?",
	    "What's the point of making myself less accessible?",
	    "Can you tell me the story of Queen Elizabeth I from this 48 laws of power book?",
	    "State the names of 5 laws?",
	]

	# A notebook displays the value of a cell's last expression, but when this
	# file runs as a script the return value of index.query() is silently
	# discarded. Print every question with its answer so the output is visible
	# in both environments.
	for query in questions:
	    print(f"Q: {query}")
	    print(f"A: {index.query(query)}\n")