# linkedin-post-generator / preprocess.py
# DrishtiSharma's picture
# Upload 6 files
# 765a4ee verified
# raw
# history blame
# 3.43 kB
import json
from llm_helper import llm
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.exceptions import OutputParserException
def process_posts(raw_file_path, processed_file_path=None):
    """Enrich raw posts with LLM-extracted metadata and unified tags.

    Loads a JSON array of post objects from ``raw_file_path``, merges the
    result of ``extract_metadata(post['text'])`` into each post, rewrites
    every post's ``tags`` through the mapping returned by
    ``get_unified_tags`` and, when an output path is given, writes the
    enriched posts back out as indented JSON.

    Args:
        raw_file_path: Path to a UTF-8 JSON file holding a list of post
            objects, each with a ``text`` key.
        processed_file_path: Destination path for the enriched posts. When
            ``None`` (the default) nothing is written to disk.

    Returns:
        The list of enriched post dicts.
    """
    with open(raw_file_path, encoding="utf-8") as file:
        posts = json.load(file)

    enriched_posts = [post | extract_metadata(post["text"]) for post in posts]

    # Skip the tag-unification LLM call when there is nothing to unify.
    if any(post.get("tags") for post in enriched_posts):
        unified_tags = get_unified_tags(enriched_posts)
    else:
        unified_tags = {}

    for post in enriched_posts:
        # The LLM-produced mapping may omit a tag; keep the original tag in
        # that case instead of failing with KeyError.
        remapped = (unified_tags.get(tag, tag) for tag in post.get("tags") or [])
        # dict.fromkeys de-duplicates while keeping a stable order, so the
        # output file does not change between otherwise identical runs.
        post["tags"] = list(dict.fromkeys(remapped))

    # The path is optional in the signature, so only write when one is given.
    if processed_file_path is not None:
        with open(processed_file_path, encoding="utf-8", mode="w") as outfile:
            json.dump(enriched_posts, outfile, indent=4)

    return enriched_posts
def extract_metadata(post):
    """Ask the LLM for line count, language and tags of a single post.

    Args:
        post: The post text; it is substituted into the prompt's ``{post}``
            placeholder.

    Returns:
        The JSON object parsed from the LLM response. The prompt requests the
        keys ``line_count``, ``language`` and ``tags``, but their presence is
        not verified here.

    Raises:
        OutputParserException: If the LLM response is not valid JSON or is
            not a JSON object.
    """
    template = '''
You are given a LinkedIn post. You need to extract number of lines, language of the post and tags.
1. Return a valid JSON. No preamble.
2. JSON object should have exactly three keys: line_count, language and tags.
3. tags is an array of text tags. Extract maximum two tags.
4. Language should be English or Hinglish (Hinglish means hindi + english)
Here is the actual post on which you need to perform this task:
{post}
'''
    pt = PromptTemplate.from_template(template)
    chain = pt | llm
    response = chain.invoke(input={"post": post})

    try:
        res = JsonOutputParser().parse(response.content)
    except OutputParserException as err:
        # Keep the exception type, but describe this task and chain the
        # parser's own error so the root cause stays visible.
        raise OutputParserException(
            "Unable to parse post metadata from the LLM response."
        ) from err

    # Callers merge the result into the post dict, so anything other than a
    # JSON object is unusable.
    if not isinstance(res, dict):
        raise OutputParserException(
            "Expected a JSON object with post metadata from the LLM response."
        )
    return res
def get_unified_tags(posts_with_metadata):
    """Ask the LLM to merge similar tags into a shorter, unified set.

    Args:
        posts_with_metadata: Iterable of post dicts; the ``tags`` entry of
            each one (if present) contributes to the set of tags to unify.

    Returns:
        A dict mapping each original tag to its unified tag, as parsed from
        the LLM response. Returns an empty dict, without calling the LLM,
        when no post carries any tag.

    Raises:
        OutputParserException: If the LLM response is not valid JSON or is
            not a JSON object.
    """
    unique_tags = set()
    for post in posts_with_metadata:
        # Tolerate posts with a missing or empty tag list.
        unique_tags.update(post.get("tags") or [])

    if not unique_tags:
        return {}

    # Sorted so the prompt is identical across runs for the same input;
    # plain set iteration order is not stable.
    unique_tags_list = ','.join(sorted(unique_tags))

    template = '''I will give you a list of tags. You need to unify tags with the following requirements,
1. Tags are unified and merged to create a shorter list.
Example 1: "Jobseekers", "Job Hunting" can be all merged into a single tag "Job Search".
Example 2: "Motivation", "Inspiration", "Drive" can be mapped to "Motivation"
Example 3: "Personal Growth", "Personal Development", "Self Improvement" can be mapped to "Self Improvement"
Example 4: "Scam Alert", "Job Scam" etc. can be mapped to "Scams"
2. Each tag should be follow title case convention. example: "Motivation", "Job Search"
3. Output should be a JSON object, No preamble
4. Output should have mapping of original tag and the unified tag.
For example: {{"Jobseekers": "Job Search", "Job Hunting": "Job Search", "Motivation": "Motivation"}}
Here is the list of tags:
{tags}
'''
    pt = PromptTemplate.from_template(template)
    chain = pt | llm
    response = chain.invoke(input={"tags": unique_tags_list})

    try:
        res = JsonOutputParser().parse(response.content)
    except OutputParserException as err:
        # Keep the exception type, but describe this task and chain the
        # parser's own error so the root cause stays visible.
        raise OutputParserException(
            "Unable to parse the unified tag mapping from the LLM response."
        ) from err

    # Callers look tags up by key, so the result must be a JSON object.
    if not isinstance(res, dict):
        raise OutputParserException(
            "Expected a JSON object mapping original tags to unified tags."
        )
    return res
if __name__ == "__main__":
    # Script entry point: enrich the raw posts and write the processed file.
    process_posts(
        raw_file_path="data/raw_posts.json",
        processed_file_path="data/processed_posts.json",
    )