Spaces:
Building
Building
Update app.py
Browse files
app.py
CHANGED
@@ -8,11 +8,89 @@ from huggingface_hub import InferenceClient
|
|
8 |
API_KEY = os.getenv("SERPHOUSE_API_KEY")
|
9 |
hf_client = InferenceClient("CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN"))
|
10 |
|
11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
COUNTRY_LANGUAGES = {
|
13 |
"South Korea": "ko",
|
14 |
"Japan": "ja",
|
15 |
"China": "zh",
|
|
|
16 |
"Russia": "ru",
|
17 |
"France": "fr",
|
18 |
"Germany": "de",
|
@@ -27,93 +105,46 @@ COUNTRY_LANGUAGES = {
|
|
27 |
"Saudi Arabia": "ar",
|
28 |
"United Arab Emirates": "ar",
|
29 |
"Egypt": "ar",
|
30 |
-
"Morocco": "ar"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
}
|
32 |
|
33 |
-
|
34 |
-
"United States": "United States",
|
35 |
-
"United Kingdom": "United Kingdom",
|
36 |
-
"Canada": "Canada",
|
37 |
-
"Australia": "Australia",
|
38 |
-
"Germany": "Germany",
|
39 |
-
"France": "France",
|
40 |
-
"Japan": "Japan",
|
41 |
-
"South Korea": "South Korea",
|
42 |
-
"China": "China",
|
43 |
-
"India": "India",
|
44 |
-
"Brazil": "Brazil",
|
45 |
-
"Mexico": "Mexico",
|
46 |
-
"Russia": "Russia",
|
47 |
-
"Italy": "Italy",
|
48 |
-
"Spain": "Spain",
|
49 |
-
"Netherlands": "Netherlands",
|
50 |
-
"Singapore": "Singapore",
|
51 |
-
"Hong Kong": "Hong Kong",
|
52 |
-
"Indonesia": "Indonesia",
|
53 |
-
"Malaysia": "Malaysia",
|
54 |
-
"Philippines": "Philippines",
|
55 |
-
"Thailand": "Thailand",
|
56 |
-
"Vietnam": "Vietnam",
|
57 |
-
"Belgium": "Belgium",
|
58 |
-
"Denmark": "Denmark",
|
59 |
-
"Finland": "Finland",
|
60 |
-
"Ireland": "Ireland",
|
61 |
-
"Norway": "Norway",
|
62 |
-
"Poland": "Poland",
|
63 |
-
"Sweden": "Sweden",
|
64 |
-
"Switzerland": "Switzerland",
|
65 |
-
"Austria": "Austria",
|
66 |
-
"Czech Republic": "Czech Republic",
|
67 |
-
"Greece": "Greece",
|
68 |
-
"Hungary": "Hungary",
|
69 |
-
"Portugal": "Portugal",
|
70 |
-
"Romania": "Romania",
|
71 |
-
"Turkey": "Turkey",
|
72 |
-
"Israel": "Israel",
|
73 |
-
"Saudi Arabia": "Saudi Arabia",
|
74 |
-
"United Arab Emirates": "United Arab Emirates",
|
75 |
-
"South Africa": "South Africa",
|
76 |
-
"Argentina": "Argentina",
|
77 |
-
"Chile": "Chile",
|
78 |
-
"Colombia": "Colombia",
|
79 |
-
"Peru": "Peru",
|
80 |
-
"Venezuela": "Venezuela",
|
81 |
-
"New Zealand": "New Zealand",
|
82 |
-
"Bangladesh": "Bangladesh",
|
83 |
-
"Pakistan": "Pakistan",
|
84 |
-
"Egypt": "Egypt",
|
85 |
-
"Morocco": "Morocco",
|
86 |
-
"Nigeria": "Nigeria",
|
87 |
-
"Kenya": "Kenya",
|
88 |
-
"Ukraine": "Ukraine",
|
89 |
-
"Croatia": "Croatia",
|
90 |
-
"Slovakia": "Slovakia",
|
91 |
-
"Bulgaria": "Bulgaria",
|
92 |
-
"Serbia": "Serbia",
|
93 |
-
"Estonia": "Estonia",
|
94 |
-
"Latvia": "Latvia",
|
95 |
-
"Lithuania": "Lithuania",
|
96 |
-
"Slovenia": "Slovenia",
|
97 |
-
"Luxembourg": "Luxembourg",
|
98 |
-
"Malta": "Malta",
|
99 |
-
"Cyprus": "Cyprus",
|
100 |
-
"Iceland": "Iceland"
|
101 |
-
}
|
102 |
-
|
103 |
-
MAJOR_COUNTRIES = list(COUNTRY_LOCATIONS.keys())
|
104 |
-
|
105 |
-
def is_english(text):
|
106 |
-
# Check whether the text consists only of English (ASCII) characters
|
107 |
-
return all(ord(char) < 128 for char in text.replace(' ', ''))
|
108 |
-
|
109 |
def translate_query(query, country):
|
110 |
try:
|
111 |
-
# For English input, use the query as-is without translating
|
112 |
if is_english(query):
|
113 |
print(f"English query detected, using original: {query}")
|
114 |
return query[:255]
|
115 |
|
116 |
-
# Case where the input is Korean and South Korea is selected
|
117 |
if country == "South Korea":
|
118 |
return query[:255]
|
119 |
|
@@ -124,7 +155,8 @@ def translate_query(query, country):
|
|
124 |
# Improved translation prompt
|
125 |
prompt = f"""Translate this text to {target_lang} language.
|
126 |
For Japanese, use Kanji and Kana.
|
127 |
-
For Chinese, use Simplified Chinese.
|
|
|
128 |
For Korean, use Hangul.
|
129 |
Only output the translated text without any explanation.
|
130 |
Text to translate: {query}"""
|
@@ -145,6 +177,10 @@ def translate_query(query, country):
|
|
145 |
|
146 |
|
147 |
|
|
|
|
|
|
|
|
|
148 |
def search_serphouse(query, country, page=1, num_result=10):
|
149 |
url = "https://api.serphouse.com/serp/live"
|
150 |
|
@@ -157,12 +193,12 @@ def search_serphouse(query, country, page=1, num_result=10):
|
|
157 |
"data": {
|
158 |
"q": translated_query,
|
159 |
"domain": "google.com",
|
160 |
-
"
|
161 |
"lang": COUNTRY_LANGUAGES.get(country, "en"),
|
162 |
"device": "desktop",
|
163 |
"serp_type": "news",
|
164 |
"page": "1",
|
165 |
-
"num": "10"
|
166 |
}
|
167 |
}
|
168 |
|
|
|
8 |
API_KEY = os.getenv("SERPHOUSE_API_KEY")
|
9 |
hf_client = InferenceClient("CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN"))
|
10 |
|
11 |
+
|
12 |
+
|
13 |
+
COUNTRY_CODES = {
|
14 |
+
"United States": "US",
|
15 |
+
"United Kingdom": "GB",
|
16 |
+
"Canada": "CA",
|
17 |
+
"Australia": "AU",
|
18 |
+
"Germany": "DE",
|
19 |
+
"France": "FR",
|
20 |
+
"Japan": "JP",
|
21 |
+
"South Korea": "KR",
|
22 |
+
"China": "CN",
|
23 |
+
"Taiwan": "TW", # ๋๋ง ์ถ๊ฐ
|
24 |
+
"India": "IN",
|
25 |
+
"Brazil": "BR",
|
26 |
+
"Mexico": "MX",
|
27 |
+
"Russia": "RU",
|
28 |
+
"Italy": "IT",
|
29 |
+
"Spain": "ES",
|
30 |
+
"Netherlands": "NL",
|
31 |
+
"Singapore": "SG",
|
32 |
+
"Hong Kong": "HK",
|
33 |
+
"Indonesia": "ID",
|
34 |
+
"Malaysia": "MY",
|
35 |
+
"Philippines": "PH",
|
36 |
+
"Thailand": "TH",
|
37 |
+
"Vietnam": "VN",
|
38 |
+
"Belgium": "BE",
|
39 |
+
"Denmark": "DK",
|
40 |
+
"Finland": "FI",
|
41 |
+
"Ireland": "IE",
|
42 |
+
"Norway": "NO",
|
43 |
+
"Poland": "PL",
|
44 |
+
"Sweden": "SE",
|
45 |
+
"Switzerland": "CH",
|
46 |
+
"Austria": "AT",
|
47 |
+
"Czech Republic": "CZ",
|
48 |
+
"Greece": "GR",
|
49 |
+
"Hungary": "HU",
|
50 |
+
"Portugal": "PT",
|
51 |
+
"Romania": "RO",
|
52 |
+
"Turkey": "TR",
|
53 |
+
"Israel": "IL",
|
54 |
+
"Saudi Arabia": "SA",
|
55 |
+
"United Arab Emirates": "AE",
|
56 |
+
"South Africa": "ZA",
|
57 |
+
"Argentina": "AR",
|
58 |
+
"Chile": "CL",
|
59 |
+
"Colombia": "CO",
|
60 |
+
"Peru": "PE",
|
61 |
+
"Venezuela": "VE",
|
62 |
+
"New Zealand": "NZ",
|
63 |
+
"Bangladesh": "BD",
|
64 |
+
"Pakistan": "PK",
|
65 |
+
"Egypt": "EG",
|
66 |
+
"Morocco": "MA",
|
67 |
+
"Nigeria": "NG",
|
68 |
+
"Kenya": "KE",
|
69 |
+
"Ukraine": "UA",
|
70 |
+
"Croatia": "HR",
|
71 |
+
"Slovakia": "SK",
|
72 |
+
"Bulgaria": "BG",
|
73 |
+
"Serbia": "RS",
|
74 |
+
"Estonia": "EE",
|
75 |
+
"Latvia": "LV",
|
76 |
+
"Lithuania": "LT",
|
77 |
+
"Slovenia": "SI",
|
78 |
+
"Luxembourg": "LU",
|
79 |
+
"Malta": "MT",
|
80 |
+
"Cyprus": "CY",
|
81 |
+
"Iceland": "IS"
|
82 |
+
}
|
83 |
+
|
84 |
+
|
85 |
+
def is_english(text):
|
86 |
+
# Check whether the text consists only of English (ASCII) characters
|
87 |
+
return all(ord(char) < 128 for char in text.replace(' ', ''))
|
88 |
+
|
89 |
COUNTRY_LANGUAGES = {
|
90 |
"South Korea": "ko",
|
91 |
"Japan": "ja",
|
92 |
"China": "zh",
|
93 |
+
"Taiwan": "zh-tw", # ๋๋ง์ด(๋ฒ์ฒด ์ค๊ตญ์ด) ์ถ๊ฐ
|
94 |
"Russia": "ru",
|
95 |
"France": "fr",
|
96 |
"Germany": "de",
|
|
|
105 |
"Saudi Arabia": "ar",
|
106 |
"United Arab Emirates": "ar",
|
107 |
"Egypt": "ar",
|
108 |
+
"Morocco": "ar",
|
109 |
+
"Greece": "el",
|
110 |
+
"Poland": "pl",
|
111 |
+
"Czech Republic": "cs",
|
112 |
+
"Hungary": "hu",
|
113 |
+
"Turkey": "tr",
|
114 |
+
"Romania": "ro",
|
115 |
+
"Bulgaria": "bg",
|
116 |
+
"Croatia": "hr",
|
117 |
+
"Serbia": "sr",
|
118 |
+
"Slovakia": "sk",
|
119 |
+
"Slovenia": "sl",
|
120 |
+
"Estonia": "et",
|
121 |
+
"Latvia": "lv",
|
122 |
+
"Lithuania": "lt",
|
123 |
+
"Ukraine": "uk",
|
124 |
+
"Israel": "he",
|
125 |
+
"Bangladesh": "bn",
|
126 |
+
"Pakistan": "ur",
|
127 |
+
"Finland": "fi",
|
128 |
+
"Denmark": "da",
|
129 |
+
"Norway": "no",
|
130 |
+
"Sweden": "sv",
|
131 |
+
"Iceland": "is",
|
132 |
+
"Philippines": "fil",
|
133 |
+
"Brazil": "pt-br",
|
134 |
+
"Argentina": "es-ar",
|
135 |
+
"Chile": "es-cl",
|
136 |
+
"Colombia": "es-co",
|
137 |
+
"Peru": "es-pe",
|
138 |
+
"Venezuela": "es-ve"
|
139 |
}
|
140 |
|
141 |
+
# Translation prompt modified
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
def translate_query(query, country):
|
143 |
try:
|
|
|
144 |
if is_english(query):
|
145 |
print(f"English query detected, using original: {query}")
|
146 |
return query[:255]
|
147 |
|
|
|
148 |
if country == "South Korea":
|
149 |
return query[:255]
|
150 |
|
|
|
155 |
# Improved translation prompt
|
156 |
prompt = f"""Translate this text to {target_lang} language.
|
157 |
For Japanese, use Kanji and Kana.
|
158 |
+
For Chinese (China), use Simplified Chinese.
|
159 |
+
For Chinese (Taiwan), use Traditional Chinese.
|
160 |
For Korean, use Hangul.
|
161 |
Only output the translated text without any explanation.
|
162 |
Text to translate: {query}"""
|
|
|
177 |
|
178 |
|
179 |
|
180 |
+
# MAJOR_COUNTRIES definition modified
|
181 |
+
MAJOR_COUNTRIES = list(COUNTRY_CODES.keys()) # COUNTRY_LOCATIONS ๋์ COUNTRY_CODES ์ฌ์ฉ
|
182 |
+
|
183 |
+
# search_serphouse function modified
|
184 |
def search_serphouse(query, country, page=1, num_result=10):
|
185 |
url = "https://api.serphouse.com/serp/live"
|
186 |
|
|
|
193 |
"data": {
|
194 |
"q": translated_query,
|
195 |
"domain": "google.com",
|
196 |
+
"country_code": COUNTRY_CODES.get(country, "US"), # country_code ์ฌ์ฉ
|
197 |
"lang": COUNTRY_LANGUAGES.get(country, "en"),
|
198 |
"device": "desktop",
|
199 |
"serp_type": "news",
|
200 |
"page": "1",
|
201 |
+
"num": "10"
|
202 |
}
|
203 |
}
|
204 |
|