GEETHANAYAGI's picture
Upload 79 files
f9d7028 verified
import os
import sys
import glob
import requests
from urllib.parse import urlencode
from dotenv import dotenv_values
import traceback
import time
# Lookup table from FLORES-style language codes (`<language>_<Script>`, e.g.
# `hin_Deva`) to the language codes this script sends to the translation
# service. `AzureTranslator.batch_translate` converts both the source and the
# target code through this table before checking them against the service's
# supported-language list, so a code missing here raises `KeyError` there.
# Several values keep a script suffix (`ks_Deva`, `mni_Beng`, `sd_Deva`);
# whether the service accepts them is decided at runtime by that
# supported-language check.
flores_to_iso = {
"asm_Beng": "as",
"ben_Beng": "bn",
"brx_Deva": "brx",
"doi_Deva": "doi",
"eng_Latn": "en",
"gom_Deva": "gom",
"guj_Gujr": "gu",
"hin_Deva": "hi",
"kan_Knda": "kn",
"kas_Arab": "ks",
"kas_Deva": "ks_Deva",
"mai_Deva": "mai",
"mal_Mlym": "ml",
"mar_Deva": "mr",
"mni_Beng": "mni_Beng",
"mni_Mtei": "mni",
"npi_Deva": "ne",
"ory_Orya": "or",
"pan_Guru": "pa",
"san_Deva": "sa",
"sat_Olck": "sat",
"snd_Arab": "sd",
"snd_Deva": "sd_Deva",
"tam_Taml": "ta",
"tel_Telu": "te",
"urd_Arab": "ur",
}
class AzureTranslator:
    """Small client for the Azure Translator Text REST API (api-version 3.0).

    Language arguments of the translate methods are FLORES-style codes
    (e.g. ``hin_Deva``); they are converted through the module-level
    ``flores_to_iso`` table before being sent to the service.
    """

    def __init__(
        self,
        subscription_key: str,
        region: str,
        endpoint: str = "https://api.cognitive.microsofttranslator.com",
    ) -> None:
        """Store the credentials and fetch the supported-language list.

        Args:
            subscription_key: value sent as ``Ocp-Apim-Subscription-Key``.
            region: value sent as ``Ocp-Apim-Subscription-Region``.
            endpoint: base URL of the translator service.

        Note:
            Construction performs an HTTP request (see
            ``get_supported_languages``).
        """
        self.http_headers = {
            "Ocp-Apim-Subscription-Key": subscription_key,
            "Ocp-Apim-Subscription-Region": region,
        }
        # A configured endpoint may end with "/"; strip it so the joined URLs
        # do not contain a double slash.
        endpoint = endpoint.rstrip("/")
        self.translate_endpoint = endpoint + "/translate?api-version=3.0&"
        self.languages_endpoint = endpoint + "/languages?api-version=3.0"
        self.supported_languages = self.get_supported_languages()

    def get_supported_languages(self) -> dict:
        """Return the ``translation`` section of the service's language list."""
        return requests.get(self.languages_endpoint).json()["translation"]

    @staticmethod
    def _extract_translations(payload):
        """Pull the first translation text of every item out of a decoded reply.

        A successful reply is expected to be a list with one item per input
        text. Anything else (for example a JSON error object) is printed and
        reported as ``None`` instead of failing with an unrelated ``TypeError``
        while indexing it.
        """
        if not isinstance(payload, list):
            print("Response:", payload)
            return None
        return [item["translations"][0]["text"] for item in payload]

    def batch_translate(self, texts: list, src_lang: str, tgt_lang: str) -> list:
        """Translate ``texts`` from ``src_lang`` to ``tgt_lang`` in one request.

        Args:
            texts: sentences to translate; an empty list is returned unchanged
                without contacting the service.
            src_lang: FLORES-style code of the source language.
            tgt_lang: FLORES-style code of the target language.

        Returns:
            The translated texts in input order, or ``None`` when the request
            fails, the reply is not valid JSON, or the reply is not a list of
            results. Details are printed in those cases.

        Raises:
            KeyError: if a language code is missing from ``flores_to_iso``.
            NotImplementedError: if the mapped code is not in the service's
                supported-language list.
        """
        if not texts:
            return texts

        src_lang = flores_to_iso[src_lang]
        tgt_lang = flores_to_iso[tgt_lang]

        if src_lang not in self.supported_languages:
            raise NotImplementedError(
                f"Source language code: `{src_lang}` not supported!"
            )
        if tgt_lang not in self.supported_languages:
            raise NotImplementedError(
                f"Target language code: `{tgt_lang}` not supported!"
            )

        body = [{"text": text} for text in texts]
        query_string = urlencode({"from": src_lang, "to": tgt_lang})

        # `except Exception` rather than a bare `except`, so Ctrl-C and
        # SystemExit are not swallowed and turned into a `None` result.
        try:
            response = requests.post(
                self.translate_endpoint + query_string,
                headers=self.http_headers,
                json=body,
            )
        except Exception:
            traceback.print_exc()
            return None

        try:
            payload = response.json()
        except Exception:
            traceback.print_exc()
            print("Response:", response.text)
            return None

        return self._extract_translations(payload)

    def text_translate(self, text: str, src_lang: str, tgt_lang: str) -> str:
        """Translate a single text; returns ``None`` if the request failed."""
        translations = self.batch_translate([text], src_lang, tgt_lang)
        # batch_translate signals failure with None; do not index into it.
        if not translations:
            return None
        return translations[0]
def _translate_file(translator, in_fname, out_fname, src_lang, tgt_lang, batch_size=128):
    """Translate every non-empty line of ``in_fname`` and write the result.

    Sentences are sent to ``translator.batch_translate`` in chunks of
    ``batch_size``. The output file is only written once every chunk has been
    translated, so a failed run leaves no partial file behind.

    Raises:
        RuntimeError: if the translator reports a failed batch (``None``).
    """
    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding; the context manager closes the handle deterministically.
    with open(in_fname, "r", encoding="utf-8") as f:
        sents = [sent.strip() for sent in f.read().split("\n") if sent]

    translations = []
    for start in range(0, len(sents), batch_size):
        batch = translator.batch_translate(
            sents[start : start + batch_size], src_lang, tgt_lang
        )
        if batch is None:
            raise RuntimeError(
                f"Translation failed for `{in_fname}` ({src_lang} -> {tgt_lang})"
            )
        translations.extend(batch)

    with open(out_fname, "w", encoding="utf-8") as f:
        f.write("\n".join(translations))


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit(f"Usage: python {os.path.basename(sys.argv[0])} <root_dir>")
    root_dir = sys.argv[1]

    # Expects a .env file containing the API credentials.
    config = dotenv_values(os.path.join(os.path.dirname(__file__), ".env"))

    t = AzureTranslator(
        config["AZURE_TRANSLATOR_TEXT_SUBSCRIPTION_KEY"],
        config["AZURE_TRANSLATOR_TEXT_REGION"],
        config["AZURE_TRANSLATOR_TEXT_ENDPOINT"],
    )

    # Every entry of root_dir is expected to be a `<src_lang>-<tgt_lang>`
    # directory holding `test.<src_lang>` and `test.<tgt_lang>`.
    pairs = sorted(glob.glob(os.path.join(root_dir, "*")))

    for pair in pairs:
        basename = os.path.basename(pair)
        print(pair)

        lang_codes = basename.split("-")
        if len(lang_codes) != 2:
            # A stray entry should not abort the whole run.
            print(f"Skipping `{basename}`: not a `<src_lang>-<tgt_lang>` name")
            continue
        src_lang, tgt_lang = lang_codes
        print(f"{src_lang} - {tgt_lang}")

        # source to target translations
        src_infname = os.path.join(pair, f"test.{src_lang}")
        tgt_outfname = os.path.join(pair, f"test.{tgt_lang}.pred.azure")
        if not os.path.exists(src_infname):
            continue

        # Existing prediction files are kept, so an interrupted run can be
        # resumed without re-translating finished pairs.
        if not os.path.exists(tgt_outfname):
            try:
                _translate_file(t, src_infname, tgt_outfname, src_lang, tgt_lang)
            except Exception as e:
                print(e)
                continue
            # Pause between files to stay clear of the service's rate limits.
            time.sleep(10)

        # target to source translations
        tgt_infname = os.path.join(pair, f"test.{tgt_lang}")
        src_outfname = os.path.join(pair, f"test.{src_lang}.pred.azure")
        if not os.path.exists(tgt_infname):
            continue

        if not os.path.exists(src_outfname):
            try:
                _translate_file(t, tgt_infname, src_outfname, tgt_lang, src_lang)
            except Exception as e:
                # Report the failure instead of skipping the pair silently.
                print(e)
                continue
            time.sleep(10)