RUPunct_big — самая большая модель из семейства RUPunct. Подходит для большинства задач.
Код инференса:
from transformers import pipeline
from transformers import AutoTokenizer
# Hub identifier of the checkpoint; used for both the tokenizer and the model.
pt = "RUPunct/RUPunct_big"

# Tokenizer is loaded with accent stripping disabled and prefix-space enabled.
tk = AutoTokenizer.from_pretrained(
    pt,
    strip_accents=False,
    add_prefix_space=True,
)

# Token-classification ("ner") pipeline; its results are consumed below via
# the 'word' and 'entity_group' keys of each returned item.
classifier = pipeline(
    "ner",
    model=pt,
    tokenizer=tk,
    aggregation_strategy="first",
)
# Punctuation appended for the last component of a label
# (e.g. "PERIOD" in "UPPER_TOTAL_PERIOD"). "O" means no punctuation.
_PUNCTUATION = {
    "O": "",
    "PERIOD": ".",
    "COMMA": ",",
    "QUESTION": "?",
    # The dash is always separated from the word by a space, for every casing.
    "TIRE": " —",
    "DVOETOCHIE": ":",
    "VOSKL": "!",
    "PERIODCOMMA": ";",
    "DEFIS": "-",
    "MNOGOTOCHIE": "...",
    "QUESTIONVOSKL": "?!",
}

# Casing transform selected by everything before the last underscore of a label.
_CASING = {
    "LOWER": lambda word: word,   # token is passed through untouched
    "UPPER": str.capitalize,      # first letter upper-cased, rest lower-cased
    "UPPER_TOTAL": str.upper,     # whole token upper-cased
}


def process_token(token: str, label: str) -> str:
    """Apply the casing and trailing punctuation encoded in *label* to *token*.

    A label has the form ``<CASING>_<PUNCTUATION>``, where ``<CASING>`` is one
    of ``LOWER``, ``UPPER`` or ``UPPER_TOTAL`` and ``<PUNCTUATION>`` is one of
    the keys of ``_PUNCTUATION`` (for example ``LOWER_COMMA`` or
    ``UPPER_TOTAL_QUESTIONVOSKL``).

    Args:
        token: The word to restore.
        label: The predicted class name for that word.

    Returns:
        The re-cased token with its punctuation mark appended. If the label is
        not recognised the token is returned unchanged, so the result is
        always a string and can be safely concatenated by the caller.
    """
    # Split on the LAST underscore so that "UPPER_TOTAL_PERIOD" yields
    # ("UPPER_TOTAL", "PERIOD") rather than ("UPPER", "TOTAL_PERIOD").
    case_name, _, punct_name = label.rpartition("_")
    recase = _CASING.get(case_name)
    mark = _PUNCTUATION.get(punct_name)
    if recase is None or mark is None:
        return token
    return recase(token) + mark
# Interactive loop: read a line, run the classifier on it, and print the text
# rebuilt from the predicted (word, label) pairs.
while True:
    try:
        input_text = input(":> ")
    except (EOFError, KeyboardInterrupt):
        # End of input (Ctrl-D / closed stdin) or Ctrl-C at the prompt:
        # leave the loop instead of dying with a traceback.
        break
    preds = classifier(input_text)
    # Every restored word is prefixed with a single space, so a non-empty
    # result starts with a space.
    output = "".join(
        " " + process_token(item["word"].strip(), item["entity_group"])
        for item in preds
    )
    print(">>>", output)
- Downloads last month
- 3,064
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social
visibility and check back later, or deploy to Inference Endpoints (dedicated)
instead.