from typing import List, Tuple

import numpy as np
import onnxruntime as ort
from sentencepiece import SentencePieceProcessor

# Load the classification and sentencepiece models
model_path = "sbd_25lang.onnx"
tokenizer_path = "32k_mixed_case_25lang.model"
tokenizer: SentencePieceProcessor = SentencePieceProcessor(tokenizer_path)
ort_session: ort.InferenceSession = ort.InferenceSession(model_path)


# Function to make a simple batch for inference
def simple_make_batch(texts: List[str], tokenizer: SentencePieceProcessor) -> Tuple[np.ndarray, np.ndarray]:
    """Encode ``texts`` and pack the token IDs into one right-padded batch.

    Note that a real data loader will account for the model's max input length.

    Args:
        texts: Raw input strings, one per batch element.
        tokenizer: SentencePiece processor used to encode the texts and to supply the BOS/EOS/pad IDs.

    Returns:
        A tuple ``(input_ids, lengths)``. ``input_ids`` is an int64 array of shape ``[len(texts), max_len]`` filled
        with the tokenizer's pad ID beyond each sequence. ``lengths`` holds the length of every sequence, counting
        the BOS and EOS tags added here.
    """
    # Encode each input; add EOS and BOS tags
    bos = tokenizer.bos_id()
    eos = tokenizer.eos_id()
    ids_list = [[bos] + tokenizer.EncodeAsIds(x) + [eos] for x in texts]
    # Pack input IDs into a padded batch
    batch_size = len(texts)
    lengths = np.array([len(x) for x in ids_list], dtype=np.int64)
    # An empty batch has no maximum; fall back to zero columns instead of raising.
    max_len = int(lengths.max()) if batch_size else 0
    input_ids = np.full(shape=[batch_size, max_len], fill_value=tokenizer.pad_id(), dtype=np.int64)
    for i, ids in enumerate(ids_list):
        input_ids[i, : len(ids)] = ids
    return input_ids, lengths


# Applies model predictions to break up a batch of texts
def apply_preds(input_ids: np.ndarray, probs: np.ndarray, lengths: np.ndarray, threshold: float) -> List[List[str]]:
    """Split every batch element into sentences at the predicted boundaries.

    Decoding uses the module-level ``tokenizer``.

    Args:
        input_ids: Padded token IDs as produced by ``simple_make_batch``.
        probs: Per-position boundary scores, indexed the same way as ``input_ids``.
        lengths: Length of each sequence, counting the BOS and EOS tags.
        threshold: A position is a sentence boundary when its score is strictly greater than this value.

    Returns:
        One list of decoded sentence strings per batch element. An element with no tokens between BOS and EOS
        yields an empty list.
    """
    out_strings: List[List[str]] = []
    for i, length in enumerate(lengths):
        # Ignore EOS/BOS: `length` counts both tags, so the real tokens sit at positions 1 .. length - 2.
        next_ids = input_ids[i, 1 : length - 1].tolist()
        next_probs = probs[i, 1 : length - 1]
        if not next_ids:
            # Nothing to segment for this element.
            out_strings.append([])
            continue
        # Find all positions that exceed the threshold as a sentence boundary. `flatnonzero` always yields a 1-D
        # result, so `.tolist()` is a list even when exactly one position qualifies.
        break_points: List[int] = np.flatnonzero(next_probs > threshold).tolist()
        # Add the final token to the break points, to not have leftover tokens after the loop
        if (not break_points) or (break_points[-1] != len(next_ids) - 1):
            break_points.append(len(next_ids) - 1)
        # Break tokens at boundaries, convert back to text
        next_strings: List[str] = []
        start = 0
        for break_point in break_points:
            sub_ids = next_ids[start : break_point + 1]
            next_strings.append(tokenizer.DecodeIds(sub_ids))
            start = break_point + 1
        out_strings.append(next_strings)
    return out_strings


# Pretty-prints the input texts and the segmented output texts
def pretty_print_input_outputs(input_texts: List[str], output_texts: List[List[str]]) -> None:
    """Print each input text followed by its segmented outputs, one tab-indented sentence per line."""
    for i, (input_text, next_output_texts) in enumerate(zip(input_texts, output_texts)):
        print(f"Input {i}: {input_text}")
        print("Outputs:")
        for output in next_output_texts:
            print(f"\t{output}")


# Define a function to run inference and print outputs
def run_example(input_texts: List[str], tokenizer: SentencePieceProcessor) -> None:
    """Batch ``input_texts``, run the ONNX model on them, and print the segmented sentences.

    Uses the module-level ``ort_session`` for inference and a fixed boundary threshold of 0.5.

    Args:
        input_texts: Texts to segment.
        tokenizer: SentencePiece processor used to build the batch.
    """
    if not input_texts:
        # Nothing to run or print for an empty batch.
        return
    # Make a simple batch
    input_ids, lengths = simple_make_batch(input_texts, tokenizer)
    # Run inference and pretty-print the outputs
    outputs = ort_session.run(None, {"input_ids": input_ids})
    probs = outputs[0]
    segmented_texts = apply_preds(input_ids=input_ids, probs=probs, lengths=lengths, threshold=0.5)
    pretty_print_input_outputs(input_texts, segmented_texts)


# Random texts from the test partition
input_texts = [
    # ar
    "إنه يبدو غالياً قليلاً بالنسبة لمبنى يحتاج إلى بعض الإصلاحات، هذا كل ما فى الأمر. هذا جيد. ماما وادك بالمنزل.",
    # bn
    "তবে ট্রেলারে তার দেখা মিলল না। পরে ব্যাগটি তল্লাশি করে ইয়াবা, চারটি স্বর্ণের বার ও নগদ সাড়ে চার লাখ টাকা পাওয়া যায়। এতে মাছটির দাম বেড়ে গেছে।",
    # de
    "Auf alle Fälle sind 90 Minuten voller Einsatz und Konzentration gefordert. Amos Vogel legte seinen Fokus auf die "
    "Kontextualisierung der einzelnen Filme durch gezielte Programmarbeit. Georgi verspricht Es wird Feierlichkeiten "
    "geben.",
    # en
    "Irwin wrote on Instagram. However, money does talk and the Roosters reported $130000 offer might not be up to "
    "scratch. There are doctors, but since they joined the civil disobedience protest movement, they can't come to the "
    "hospital.",
    # es
    "Y sí, efectivamente, nos estamos refiriendo a la NASA, la ESA, etc. También tenía un servicio de bar y venta de "
    "refrescos y bebidas alcohólicas, así como otros productos. En cuanto a la Libre Determinación, su importe se "
    "distribuirá de forma anual.",
    # et
    "Hiljuti suure tagasituleku teinud Mike Tyson naaseb taas poksiringi. Riigikogu liikme Kert Kingo hinnangul on "
    "valitsus riigikokku toonud seaduseelnõu, millega tahetakse riigis kehtestada totalitaarset kontrolli. Kehakaalu "
    "tõusu tingib liigne energia tarbimine, mida organism ei jõua päeva jooksul ära kulutada.",
    # fi
    "Yksi ongelma ratkaistu. Kuinka kauan olette olleet Meksikossa? Kävin jokin aika sitten tapaamassa komisario "
    "Fournieria tärkeässä asiassa.",
    # fr
    "Avec cette série, on sent quelque chose. En rafale, il pourra dépasser les 65 kmh. Le mercure restera homogène à "
    "16 °C. Désormais à 10 contre 10 avec un but de retard, Arsenal est complètement relancé dans cette demi finale "
    "aller.",
    # hi
    "प्रभु चरणणों में यही विनती है कि हमारे विचाररों को सदैव पवित्र बनाए रखने की कृपा बनी रहे। भारत में यह सेरेमनी डिज्नी स्टार के चैनललों पर देखी जा सकती है। इस "
    "कार्रवाई से होटललों और ललांज में अफरा तफरी मच गई है।",
    # id
    "Masyarakat juga harus menjalankan protokol kesehatan dengan disiplin dan juga segera divaksinasi bagi yang belum. "
    "Pelaku ingin nikah, tetapi tidak direstui bapaknya. Untuk selanjutnya, setiap produksi Seven Bucks, baik TV, film "
    "atau apa pun, kami tidak akan lagi menggunakan senjata sungguhan.",
    # is
    "Síðustu vikurnar voru móður okkar hreint kvalræði. Helena Ólafsdóttir og Guðlaug Jónsdóttir knattspyrnukempur "
    "gengu í heilagt hjónaband um helgina. Þess vegna er alltaf verið að tala um að vera ekki við hraunjaðarinn.",
    # it
    "Credevo di potermi fidare di lei. Andare con loro, portarli fuori dal porto? I romani, prima di andarsene, "
    "sommersero la città. I tuoi antenati custodirono il segreto.",
    # ja
    "毛先をそろえる程度で?持たないよ。レベッカ上品な物言いで頼む。またイェーガーに乗ったらあなたは死にます。",
    # lt
    "Mokytojau, nusiimk skrybėlę. Kur aš tokia eisiu? Mes galime važiuoti kitą vakarą. Tik arbatos, ačiū.",
    # lv
    "Viņa ir gatava izveidot uzbrucēja fotorobotu. Jā. Dažas no labākajām snaudām mūžā esmu izbaudījis Karaliskajā "
    "Šekspīra teātrī. Kas viņu iedrošinās vēl vairāk.",
    # ko
    "하지만 원전 추진에 관한 내부 문건이 분명히 더 있을 겁니다. 한국은 어떤가. 반면 글로벌 제약사들은 이미 콜드체인에 관해 큰 관심을 갖고 있었다.",
    # no
    "Ash Ketchum drømmer om eventyr. Ingen i Starfleet kunne gjøre dette. Vil du bli påkjørt? Det går så bra her.",
    # nl
    "Ik heb een paar dagen nodig om de mannen weer te verzamelen. Heeft ze een interessant accent? We moeten deze kant "
    "op. Ik kon er niks aan doen.",
    # pl
    "Potraktowaliśmy ją bardzo poważnie. Prace Larsa Vilksa są obecnie wystawiane na wystawie Sztuka polityczna w "
    "Centrum Sztuki Współczesnej Zamek Ujazdowski w Warszawie. Czy do uznania choroby zawodowej trzeba wykazać, że "
    "nigdzie poza pracą nie było się narażonym na szkodliwe czynniki?",
    # pt
    "John Kennedy, titular em sua ausência, está suspenso com três cartões amarelos. Esses três pilares para a "
    "imigração, citados pela pesquisa do ESCOE, começaram a cambalear com a pandemia. As poucas oportunidades que teve "
    "no profissional do Defensor o fizeram reavaliar o futuro.",
    # ru
    "Они не знали, что я просто угараю, следует из композиции Элджея. Нас слишком мало, чтобы потерять сотни жизней из "
    "за злоупотребления алкоголем, когда мы выйдем из пандемии коронавируса. Катар начал предлагать туры для "
    "болельщиков на ЧМ 2022.",
    # tr
    "Yani konjonktürel şartlarla pozisyon alan NATO üyeleri kurumsal işbirliğine zarar veren bir dönemi geçirdi NATO. "
    "Daha sonra insanları sosyal medyadan bilgilendirdik. Devam eden şampiyonada ülkemizi temsil eden diğer sporculara "
    "başarılar diliyorum dedi.",
    # sv
    "Öppna dina fläskiga, sympatiska armar? Vi spelar. 50 dollar per poäng. Jag menari gär, förraveckan.",
    # uk
    "Треба бути тепер готовими до наступного. Багато дітей опинилися ізольованими вдома, а час, який вони проводять у "
    "мережі, збільшився у рази. Це мала б супроводжувати дискусія у ЗМІ та суспільстві, аби було зрозуміло, чому саме "
    "це пропаганда, були наведені докази.",
    # zh
    "不断刷新的数字背后是满满民生情。但另两名非盟官员说,莫德纳要到明年才能交付疫苗,导致谈判破裂。温室气体等大气本底观测是一项专业性很强的工作,容不得一点马虎。",
]

# Run example with every language.
print("Example from each language:")
run_example(input_texts, tokenizer)

# Run an example with a lot of English acronyms, to see model behavior with non-breaking fullstops. Some random texts
# from opensubtitles.
input_texts = [
    "R.J. MacReady, helicopter pilot, U.S. Outpost Number 31. How will I get to L.A.? He's a U.S. Marshal. Let him go. "
    "Let him go. Let me see your license and I.D. Card."
]
print("Example with lots of acronyms:")
run_example(input_texts, tokenizer)

# Run with the acronyms in lower-case, to deny the model the true-case information when making decisions
print("Example with lower-cased inputs:")
input_texts = [x.lower() for x in input_texts]
run_example(input_texts, tokenizer)