|
import csv
|
|
import torch
|
|
from transformers import pipeline
|
|
|
|
|
|
# Text-generation pipeline used for every prompt in this script: the
# Mistral-7B-Instruct v0.3 checkpoint, loaded with float16 weights on device 0.
chatbot = pipeline(
    task="text-generation",
    model="mistralai/Mistral-7B-Instruct-v0.3",
    torch_dtype=torch.float16,
    device=0,
)
|
|
|
|
|
|
# Sentiment classes requested from the model. The position of each entry in
# this list is what gets written to the CSV as the numeric label.
sentiments = [
    "Positive",
    "Neutral",
    "Negative",
]
|
|
|
|
|
|
# Content types inserted into the prompt as "Content type: ...".
# Order matters: entries are consumed cyclically by index.
formats = [
    "Feature Stories",
    "Instructional Manuals",
    "FAQs",
    "Policy Documents",
    "Live Stream Descriptions",
    "Editorial Content",
    "Research Papers",
    "User Manuals",
    "Commentaries",
    "Opinion Pieces",
    "Newsletters",
    "Online Courses",
    "Photo Essays",
    "Annual Reports",
    "User-Generated Content",
    "Testimonials",
    "DIY Content",
    "How-To Videos",
    "Campaign Reports",
    "Legal Briefs",
    "Blog Posts",
    "Case Studies",
    "Tutorials",
    "Interviews",
    "Press Releases",
    "eBooks",
    "Infographics",
    "Webinars",
    "Podcast Descriptions",
    "Video Scripts",
    "Advertisements",
    "Forum Discussions",
    "Whitepapers",
    "Surveys",
    "Product Reviews",
    "Event Summaries",
    "Opinion Editorials",
    "Letters to the Editor",
    "Round-Up Posts",
    "Buying Guides",
    "Checklists",
    "Cheat Sheets",
    "Recipes",
    "Travel Guides",
    "Profiles",
    "Lists",
    "Q&A Sessions",
    "Debates",
    "Polls",
]
|
|
|
|
|
|
# Subject areas inserted into the prompt as "Topic: ...".
# Order matters: entries are consumed cyclically by index.
topics = [
    "Family",
    "Travel",
    "Politics",
    "Science",
    "Health",
    "Technology",
    "Sports",
    "Education",
    "Environment",
    "Economics",
    "Culture",
    "History",
    "Music",
    "Literature",
    "Food",
    "Art",
    "Fashion",
    "Entertainment",
    "Business",
    "Relationships",
    "Fitness",
    "Automotive",
    "Finance",
    "Real Estate",
    "Law",
    "Psychology",
    "Philosophy",
    "Religion",
    "Gardening",
    "DIY",
    "Hobbies",
    "Pets",
    "Career",
    "Marketing",
    "Customer Service",
    "Networking",
    "Innovation",
    "Artificial Intelligence",
    "Sustainability",
    "Social Issues",
    "Digital Media",
    "Programming",
    "Cybersecurity",
    "Astronomy",
    "Geography",
    "Travel Tips",
    "Cooking",
    "Parenting",
    "Productivity",
    "Mindfulness",
    "Mental Health",
    "Self-Improvement",
    "Leadership",
    "Teamwork",
    "Volunteering",
    "Nonprofits",
    "Gaming",
    "E-commerce",
    "Photography",
    "Videography",
    "Film",
    "Television",
    "Streaming Services",
    "Podcasts",
    "Public Speaking",
    "Event Planning",
    "Interior Design",
    "Architecture",
    "Urban Development",
    "Agriculture",
    "Climate Change",
    "Renewable Energy",
    "Space Exploration",
    "Biotechnology",
    "Cryptocurrency",
    "Blockchain",
    "Robotics",
    "Automated Systems",
    "Genetics",
    "Medicine",
    "Pharmacy",
    "Veterinary Science",
    "Marine Biology",
    "Ecology",
    "Conservation",
    "Wildlife",
    "Botany",
    "Zoology",
    "Geology",
    "Meteorology",
    "Aviation",
    "Maritime",
    "Logistics",
    "Supply Chain",
    "Human Resources",
    "Diversity and Inclusion",
    "Ethics",
    "Corporate Governance",
    "Public Relations",
    "Journalism",
    "Advertising",
    "Sales",
    "Customer Experience",
    "Retail",
    "Hospitality",
    "Tourism",
    "Luxury Goods",
    "Consumer Electronics",
    "Fashion Design",
    "Textiles",
    "Jewelry",
    "Cosmetics",
    "Skincare",
    "Perfume",
    "Toys",
    "Gadgets",
    "Home Appliances",
    "Furniture",
    "Home Improvement",
    "Landscaping",
    "Real Estate Investment",
]
|
|
|
|
|
|
csv_file = "sentences.csv"

# Create (or truncate) the output file so it starts with just the header row;
# data rows are appended to the same path later in the script.
with open(csv_file, "w", newline="", encoding="utf-8") as header_file:
    header_writer = csv.writer(
        header_file,
        delimiter=",",
        quotechar='"',
        quoting=csv.QUOTE_MINIMAL,
    )
    header_row = ["text", "label"]
    header_writer.writerow(header_row)
|
|
|
|
|
|
def ensure_correct_quoting(text):
    """Return *text* wrapped in double quotes, adding them only if missing.

    A string that already starts and ends with ``"`` (and is at least two
    characters long, so the opening and closing quote are distinct) is
    returned unchanged; anything else is wrapped in one new pair of quotes.

    Note: ``csv.writer`` applies its own quoting and escapes embedded quote
    characters, so a value returned from here and then passed to a writer is
    stored with the quote characters as part of the field's content.

    Args:
        text: The string to wrap.

    Returns:
        The string, guaranteed to begin and end with a double quote.
    """
    # A lone '"' both starts and ends with a quote but is not a quoted
    # string, hence the explicit length check.
    already_wrapped = (
        len(text) >= 2 and text.startswith('"') and text.endswith('"')
    )
    if already_wrapped:
        return text
    return f'"{text}"'
|
|
|
|
|
|
# Number of data rows to generate before stopping.
target_rows = 100000

row_count = 0
format_index = 0
topic_index = 0

# Generate one sentence per prompt and append it to the CSV together with the
# sentiment's list index as its label.
#
# format_index and topic_index advance in lockstep with the sentiment loop, so
# with the current list sizes (3, 49 and 121, which are pairwise coprime) the
# same prompt comes around again every 3 * 49 * 121 = 17,787 rows.
while row_count < target_rows:
    for idx, sentiment in enumerate(sentiments):
        format_type = formats[format_index % len(formats)]
        format_index += 1
        topic = topics[topic_index % len(topics)]
        topic_index += 1

        prompt = f"Write a single sentence of web content in Croatian. Content type: {format_type}. Topic: {topic}. Sentiment: {sentiment}."

        response = chatbot(prompt, max_new_tokens=100)

        print(f"Full model response: {response}")

        generated_text = response[0]['generated_text']

        # Remove the prompt text from the output and keep only the first line
        # of what remains.
        clean_text = generated_text.replace(prompt, "").strip().split('\n')[0]

        # Drop one pair of wrapping double quotes if the model added them.
        # Quoting is left entirely to csv.writer: wrapping the text manually
        # makes the writer escape those quotes ("""text"""), so every stored
        # value would carry literal quote characters.
        if len(clean_text) >= 2 and clean_text.startswith('"') and clean_text.endswith('"'):
            clean_text = clean_text[1:-1].strip()

        # The file is reopened and closed for every row, so each generated
        # row is written out before the next (slow) generation starts.
        with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow([clean_text, idx])

        row_count += 1
        print(f"Response for sentiment '{sentiment}' saved to {csv_file}. Total rows: {row_count}")

        # target_rows need not be a multiple of len(sentiments); stop in the
        # middle of a sentiment pass so exactly target_rows rows are written.
        if row_count >= target_rows:
            break

print("All responses saved. Total rows:", row_count)
|
|
|