# GEETHANAYAGI's picture
# Upload 79 files
# f9d7028 verified
import os
import sys
import glob
from tqdm import tqdm
from google.cloud import translate
# Expects a JSON file named api_key.json, containing the API credentials,
# in the same directory as this script. Its path is exported through the
# GOOGLE_APPLICATION_CREDENTIALS environment variable.
_script_dir = os.path.dirname(__file__)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.join(_script_dir, r"api_key.json")
# Maps FLORES-style language identifiers (<language>_<script>) to the shorter
# language codes used when building translation requests.
flores_to_iso = {
    "asm_Beng": "as",
    "ben_Beng": "bn",
    "doi_Deva": "doi",
    "eng_Latn": "en",
    "gom_Deva": "gom",
    "guj_Gujr": "gu",
    "hin_Deva": "hi",
    "kan_Knda": "kn",
    "mai_Deva": "mai",
    "mal_Mlym": "ml",
    "mar_Deva": "mr",
    # Kept with an underscore here; translate_text rewrites it to "mni-Mtei".
    "mni_Mtei": "mni_Mtei",
    "npi_Deva": "ne",
    "ory_Orya": "or",
    "pan_Guru": "pa",
    "san_Deva": "sa",
    "sat_Olck": "sat",
    "snd_Arab": "sd",
    "tam_Taml": "ta",
    "tel_Telu": "te",
    "urd_Arab": "ur",
}
# Client shared by every translate_text call; created lazily on first use so
# that importing this module does not construct a client.
_translation_client = None


def _get_translation_client():
    """Return the shared TranslationServiceClient, creating it on first use."""
    global _translation_client
    if _translation_client is None:
        _translation_client = translate.TranslationServiceClient()
    return _translation_client


def _api_language_code(flores_code):
    """Return the language code sent in the request for a FLORES identifier.

    Raises:
        KeyError: if ``flores_code`` is not a key of ``flores_to_iso``.
    """
    code = flores_to_iso[flores_code]
    # flores_to_iso stores this one code with an underscore; the request uses
    # the hyphenated spelling instead.
    return "mni-Mtei" if code == "mni_Mtei" else code


# Copy the project id from the json file containing API credentials
def translate_text(text, src_lang, tgt_lang, project_id="project_id"):
    """Translate ``text`` from ``src_lang`` to ``tgt_lang``.

    Args:
        text: The plain-text string to translate; it is sent as the only
            entry of the request's ``contents`` list.
        src_lang: Source language as a key of ``flores_to_iso``
            (e.g. ``"eng_Latn"``).
        tgt_lang: Target language as a key of ``flores_to_iso``.
        project_id: Project id used to build the request's ``parent`` path.
            The default is a placeholder; copy the real id from the JSON
            credentials file.

    Returns:
        The ``translated_text`` of every translation in the response,
        concatenated into a single string.

    Raises:
        KeyError: if ``src_lang`` or ``tgt_lang`` is not in ``flores_to_iso``.
    """
    source_code = _api_language_code(src_lang)
    target_code = _api_language_code(tgt_lang)

    location = "global"
    parent = f"projects/{project_id}/locations/{location}"

    # Reuse one client across calls instead of building a new one per
    # sentence; callers invoke this function once for every input line.
    response = _get_translation_client().translate_text(
        request={
            "parent": parent,
            "contents": [text],
            "mime_type": "text/plain",  # mime types: text/plain, text/html
            "source_language_code": source_code,
            "target_language_code": target_code,
        }
    )

    return "".join(
        translation.translated_text for translation in response.translations
    )
# ISO-style codes (values of flores_to_iso) for which translations are
# produced. Pairs whose non-English side maps to any other code are skipped.
_SUPPORTED_ISO_CODES = frozenset(
    "as bn doi gom gu hi kn mai ml mni_Mtei mr ne or pa sa sd ta te ur".split()
)


def _read_sentences(fname):
    """Return the non-empty lines of ``fname``, each stripped of whitespace."""
    # Read explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding, and close the file deterministically.
    with open(fname, "r", encoding="utf-8") as f:
        return [line.strip() for line in f.read().split("\n") if line]


def _translate_file(in_fname, out_fname, src_lang, tgt_lang):
    """Translate ``in_fname`` line by line and write the result to ``out_fname``.

    Does nothing when the input file is missing or the output file already
    exists, so an interrupted run can be resumed without redoing finished
    files.
    """
    if not os.path.exists(in_fname) or os.path.exists(out_fname):
        return

    translations = [
        translate_text(text, src_lang, tgt_lang).strip()
        for text in tqdm(_read_sentences(in_fname))
    ]

    with open(out_fname, "w", encoding="utf-8") as f:
        f.write("\n".join(translations))


def main(root_dir):
    """Translate the test files of every language-pair entry under ``root_dir``.

    Each entry is expected to be named ``<src_lang>-<tgt_lang>`` using keys of
    ``flores_to_iso``. For every accepted pair, ``test.<src_lang>`` is
    translated into ``test.<tgt_lang>.pred.google`` and ``test.<tgt_lang>``
    into ``test.<src_lang>.pred.google``.
    """
    for pair in sorted(glob.glob(os.path.join(root_dir, "*"))):
        print(pair)

        # Skip anything that is not named "<src>-<tgt>" (stray files, etc.)
        # instead of crashing while unpacking the split result.
        parts = os.path.basename(pair).split("-")
        if len(parts) != 2:
            continue
        src_lang, tgt_lang = parts

        if src_lang not in flores_to_iso or tgt_lang not in flores_to_iso:
            continue

        # The language that decides whether the pair is processed is the
        # target when the source is English, otherwise the source.
        lang = tgt_lang if src_lang == "eng_Latn" else src_lang
        if flores_to_iso[lang] not in _SUPPORTED_ISO_CODES:
            continue

        print(f"{src_lang} - {tgt_lang}")

        # source to target translations
        _translate_file(
            os.path.join(pair, f"test.{src_lang}"),
            os.path.join(pair, f"test.{tgt_lang}.pred.google"),
            src_lang,
            tgt_lang,
        )

        # target to source translations
        _translate_file(
            os.path.join(pair, f"test.{tgt_lang}"),
            os.path.join(pair, f"test.{src_lang}.pred.google"),
            tgt_lang,
            src_lang,
        )


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <root_dir>")
    main(sys.argv[1])