Commit 11dec1a by jobanpreet123
Parent(s): 2f84696

scrapping code changed
Files changed:
- __pycache__/advance_post.cpython-310.pyc +0 -0
- __pycache__/paraphrase_post.cpython-310.pyc +0 -0
- __pycache__/scrap_post.cpython-310.pyc +0 -0
- advance_post.py +3 -3
- app.py +1 -1
- paraphrase_post.py +1 -1
- scrap_post.py +10 -30
__pycache__/advance_post.cpython-310.pyc ADDED
Binary file (3.46 kB)

__pycache__/paraphrase_post.cpython-310.pyc ADDED
Binary file (2.96 kB)

__pycache__/scrap_post.cpython-310.pyc ADDED
Binary file (717 Bytes)
advance_post.py CHANGED
@@ -10,7 +10,7 @@ import nest_asyncio
 def google_search(linkedin_post,model , google_api_key, search_engine_id , num_results_per_query=[3,2,1]):
 
     response_schemas = [
-        ResponseSchema(name="
+        ResponseSchema(name="questions", description="These are the top three relevant questions from the LinkedIn post" , type="list")]
     output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
     format_instructions = output_parser.get_format_instructions()
 
@@ -29,7 +29,7 @@ def google_search(linkedin_post,model , google_api_key, search_engine_id , num_r
 
     chain = prompt | model | output_parser
     result=chain.invoke({"post": linkedin_post})
-    questions=result['
+    questions=result['questions']
     # print(questions)
 
     all_links = []
@@ -61,7 +61,7 @@ def google_search(linkedin_post,model , google_api_key, search_engine_id , num_r
     # result=chain.invoke({'post':linkedinpost , 'content':docs})
     # return result , docs
 
-
+nest_asyncio.apply()
 def advanced_post(all_links ,model ,linkedinpost):
     loader = WebBaseLoader(all_links,encoding="utf-8")
     loader.requests_per_second = 1
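The fix pins down the output contract: line 13 now declares a `questions` list field, and line 32 reads exactly that key from the parsed result (the pre-change text of both lines is truncated in this view, so only the new values are certain). A minimal, self-contained sketch of how the schema and parser interact; the sample model reply below is invented for illustration:

from langchain.output_parsers import ResponseSchema, StructuredOutputParser

# Same schema as the new line 13: the parser yields a dict keyed by
# "questions", which is what the new line 32 reads.
response_schemas = [
    ResponseSchema(
        name="questions",
        description="These are the top three relevant questions from the LinkedIn post",
        type="list",
    )
]
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

# These instructions get embedded in the prompt so the model answers
# as a JSON code block.
print(output_parser.get_format_instructions())

# A hypothetical, well-formed model reply:
raw_reply = """```json
{"questions": ["Question one?", "Question two?", "Question three?"]}
```"""
result = output_parser.parse(raw_reply)
questions = result["questions"]  # mirrors the new line 32
print(questions)

The `nest_asyncio.apply()` call added just before `advanced_post` is consistent with `WebBaseLoader`'s concurrent fetching (throttled here by `requests_per_second = 1`): nest_asyncio exists to let asyncio-based code run inside a process that already owns an event loop, as Jupyter and some app servers do.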
app.py CHANGED
@@ -3,7 +3,7 @@ import re
 import openai
 from paraphrase_post import get_original_url , paraphrased_post
 from advance_post import google_search , advanced_post
-from
+from langchain_community.chat_models import ChatOpenAI
 from langchain_groq import ChatGroq
 #from langchain import HuggingFaceHub
 
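The corrected import reflects LangChain's package split: provider integrations such as `ChatOpenAI` live in `langchain_community`, not the core `langchain` package (the pre-change import is truncated in this view). A small sketch of constructing the two chat backends these imports expose; the model names and environment-variable lookups are illustrative assumptions, not values from this repo:

import os

from langchain_community.chat_models import ChatOpenAI
from langchain_groq import ChatGroq

def build_model(provider: str = "groq"):
    # Illustrative defaults; the app's actual model choices are outside this diff.
    if provider == "openai":
        return ChatOpenAI(model_name="gpt-3.5-turbo",
                          openai_api_key=os.environ["OPENAI_API_KEY"])
    return ChatGroq(model_name="mixtral-8x7b-32768",
                    groq_api_key=os.environ["GROQ_API_KEY"])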
paraphrase_post.py CHANGED
@@ -1,4 +1,3 @@
-from langchain_community.document_loaders import WebBaseLoader
 from langchain.prompts import ChatPromptTemplate
 from langchain.output_parsers import ResponseSchema
 from langchain.output_parsers import StructuredOutputParser
@@ -45,6 +44,7 @@ def get_original_url(url):
 def paraphrased_post(url,model):
 
     post=scrappost(url)
+    print(post)
 
     template="""You are a helpful paraphraser tool. You are provided with a content and your task is to paraphrase it.
     {data}"""
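Functionally this hunk only drops the now-unused `WebBaseLoader` import and adds a debug `print(post)` after scraping. The chain that consumes the template sits outside the hunk; a minimal sketch of how it is typically wired in this codebase's pipe style, with an assumed model and a hypothetical post URL:

from langchain.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

from scrap_post import scrappost

template = """You are a helpful paraphraser tool. You are provided with a content and your task is to paraphrase it.
{data}"""

prompt = ChatPromptTemplate.from_template(template)
model = ChatGroq(model_name="mixtral-8x7b-32768")  # assumed model choice
chain = prompt | model  # same pipe composition as advance_post.py

post = scrappost("https://www.linkedin.com/posts/...")  # hypothetical URL
print(post)  # the debug print this commit adds
# result = chain.invoke({"data": post})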
scrap_post.py CHANGED
@@ -1,33 +1,13 @@
-
-
-
+import requests
+import json
+from bs4 import BeautifulSoup
 
 
 def scrappost(url):
-
-
-
-
-
-
-
-    for element in glossary_page:
-        if 'unstructured.documents.html.HTMLTitle' in str(type(element)):
-            # If there's already content in the group, add it to all_groups
-            if group['page_content']:
-                all_groups.append(group)
-            group = {'page_content': ''}
-            group['page_content'] += element.text
-        if 'unstructured.documents.html.HTMLNarrativeText' in str(type(element)):
-            group['page_content'] += element.text
-
-        if "unstructured.documents.html.HTMLListItem" in str(type(element)):
-            group['page_content']+=element.text
-
-    # # Add the last group if it exists
-    if group['page_content']:
-        all_groups.append(group)
-
-    # Print the groups
-    for group in all_groups[:1]:
-        return group["page_content"]
+    response = requests.get(url)
+    soup = BeautifulSoup(response.text, 'html.parser')
+    span_tags = soup.find_all('script',type="application/ld+json")
+    content_list = [tag.get_text() for tag in span_tags]
+    for content in content_list:
+        data=json.loads(content)['articleBody']
+        return data
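The rewrite swaps the `unstructured`-style HTML grouping (whose setup lines are partially lost in this view) for a direct approach: fetch the page, find the `<script type="application/ld+json">` blocks, and return the `articleBody` field of the first one, which is where the post text sits in the page's JSON-LD metadata. Note that `span_tags` actually holds `<script>` tags; the name is a leftover. The new code also assumes the request succeeds and every block parses as JSON with an `articleBody` key. A slightly hardened sketch of the same logic; the guards, timeout value, and `None` fallback are my additions:

import json

import requests
from bs4 import BeautifulSoup

def scrappost(url):
    # Fail fast on HTTP errors rather than parsing an error page.
    # The timeout value is an illustrative choice.
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    # The post text is embedded as JSON-LD metadata in <script> tags.
    for tag in soup.find_all('script', type='application/ld+json'):
        try:
            data = json.loads(tag.get_text())
        except json.JSONDecodeError:
            continue  # skip malformed blocks instead of raising
        if isinstance(data, dict) and 'articleBody' in data:
            return data['articleBody']
    return None  # no block carried an articleBody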