pminervini's picture
update
1a7b8b2
raw
history blame
26.1 kB
# coding=utf-8
# Copyright 2023 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility library of instructions."""
import functools
import random
import re
from typing import List
import immutabledict
import nltk
def download_nltk_resources():
    """Make sure the NLTK 'punkt' tokenizer data is present.

    The resource is looked up with ``nltk.data.find``; ``nltk.download`` is
    only invoked when that lookup raises ``LookupError``.
    """
    punkt_resource = "tokenizers/punkt"
    try:
        nltk.data.find(punkt_resource)
    except LookupError:
        nltk.download("punkt")


# Executed at import time so the tokenizer data is checked before any of the
# helpers below are called.
download_nltk_resources()
WORD_LIST = [
"western",
"sentence",
"signal",
"dump",
"spot",
"opposite",
"bottom",
"potato",
"administration",
"working",
"welcome",
"morning",
"good",
"agency",
"primary",
"wish",
"responsibility",
"press",
"problem",
"president",
"steal",
"brush",
"read",
"type",
"beat",
"trainer",
"growth",
"lock",
"bone",
"case",
"equal",
"comfortable",
"region",
"replacement",
"performance",
"mate",
"walk",
"medicine",
"film",
"thing",
"rock",
"tap",
"total",
"competition",
"ease",
"south",
"establishment",
"gather",
"parking",
"world",
"plenty",
"breath",
"claim",
"alcohol",
"trade",
"dear",
"highlight",
"street",
"matter",
"decision",
"mess",
"agreement",
"studio",
"coach",
"assist",
"brain",
"wing",
"style",
"private",
"top",
"brown",
"leg",
"buy",
"procedure",
"method",
"speed",
"high",
"company",
"valuable",
"pie",
"analyst",
"session",
"pattern",
"district",
"pleasure",
"dinner",
"swimming",
"joke",
"order",
"plate",
"department",
"motor",
"cell",
"spend",
"cabinet",
"difference",
"power",
"examination",
"engine",
"horse",
"dimension",
"pay",
"toe",
"curve",
"literature",
"bother",
"fire",
"possibility",
"debate",
"activity",
"passage",
"hello",
"cycle",
"background",
"quiet",
"author",
"effect",
"actor",
"page",
"bicycle",
"error",
"throat",
"attack",
"character",
"phone",
"tea",
"increase",
"outcome",
"file",
"specific",
"inspector",
"internal",
"potential",
"staff",
"building",
"employer",
"shoe",
"hand",
"direction",
"garden",
"purchase",
"interview",
"study",
"recognition",
"member",
"spiritual",
"oven",
"sandwich",
"weird",
"passenger",
"particular",
"response",
"reaction",
"size",
"variation",
"a",
"cancel",
"candy",
"exit",
"guest",
"condition",
"fly",
"price",
"weakness",
"convert",
"hotel",
"great",
"mouth",
"mind",
"song",
"sugar",
"suspect",
"telephone",
"ear",
"roof",
"paint",
"refrigerator",
"organization",
"jury",
"reward",
"engineering",
"day",
"possession",
"crew",
"bar",
"road",
"description",
"celebration",
"score",
"mark",
"letter",
"shower",
"suggestion",
"sir",
"luck",
"national",
"progress",
"hall",
"stroke",
"theory",
"offer",
"story",
"tax",
"definition",
"history",
"ride",
"medium",
"opening",
"glass",
"elevator",
"stomach",
"question",
"ability",
"leading",
"village",
"computer",
"city",
"grand",
"confidence",
"candle",
"priest",
"recommendation",
"point",
"necessary",
"body",
"desk",
"secret",
"horror",
"noise",
"culture",
"warning",
"water",
"round",
"diet",
"flower",
"bus",
"tough",
"permission",
"week",
"prompt",
"connection",
"abuse",
"height",
"save",
"corner",
"border",
"stress",
"drive",
"stop",
"rip",
"meal",
"listen",
"confusion",
"girlfriend",
"living",
"relation",
"significance",
"plan",
"creative",
"atmosphere",
"blame",
"invite",
"housing",
"paper",
"drink",
"roll",
"silver",
"drunk",
"age",
"damage",
"smoke",
"environment",
"pack",
"savings",
"influence",
"tourist",
"rain",
"post",
"sign",
"grandmother",
"run",
"profit",
"push",
"clerk",
"final",
"wine",
"swim",
"pause",
"stuff",
"singer",
"funeral",
"average",
"source",
"scene",
"tradition",
"personal",
"snow",
"nobody",
"distance",
"sort",
"sensitive",
"animal",
"major",
"negotiation",
"click",
"mood",
"period",
"arrival",
"expression",
"holiday",
"repeat",
"dust",
"closet",
"gold",
"bad",
"sail",
"combination",
"clothes",
"emphasis",
"duty",
"black",
"step",
"school",
"jump",
"document",
"professional",
"lip",
"chemical",
"front",
"wake",
"while",
"inside",
"watch",
"row",
"subject",
"penalty",
"balance",
"possible",
"adult",
"aside",
"sample",
"appeal",
"wedding",
"depth",
"king",
"award",
"wife",
"blow",
"site",
"camp",
"music",
"safe",
"gift",
"fault",
"guess",
"act",
"shame",
"drama",
"capital",
"exam",
"stupid",
"record",
"sound",
"swing",
"novel",
"minimum",
"ratio",
"machine",
"shape",
"lead",
"operation",
"salary",
"cloud",
"affair",
"hit",
"chapter",
"stage",
"quantity",
"access",
"army",
"chain",
"traffic",
"kick",
"analysis",
"airport",
"time",
"vacation",
"philosophy",
"ball",
"chest",
"thanks",
"place",
"mountain",
"advertising",
"red",
"past",
"rent",
"return",
"tour",
"house",
"construction",
"net",
"native",
"war",
"figure",
"fee",
"spray",
"user",
"dirt",
"shot",
"task",
"stick",
"friend",
"software",
"promotion",
"interaction",
"surround",
"block",
"purpose",
"practice",
"conflict",
"routine",
"requirement",
"bonus",
"hole",
"state",
"junior",
"sweet",
"catch",
"tear",
"fold",
"wall",
"editor",
"life",
"position",
"pound",
"respect",
"bathroom",
"coat",
"script",
"job",
"teach",
"birth",
"view",
"resolve",
"theme",
"employee",
"doubt",
"market",
"education",
"serve",
"recover",
"tone",
"harm",
"miss",
"union",
"understanding",
"cow",
"river",
"association",
"concept",
"training",
"recipe",
"relationship",
"reserve",
"depression",
"proof",
"hair",
"revenue",
"independent",
"lift",
"assignment",
"temporary",
"amount",
"loss",
"edge",
"track",
"check",
"rope",
"estimate",
"pollution",
"stable",
"message",
"delivery",
"perspective",
"mirror",
"assistant",
"representative",
"witness",
"nature",
"judge",
"fruit",
"tip",
"devil",
"town",
"emergency",
"upper",
"drop",
"stay",
"human",
"neck",
"speaker",
"network",
"sing",
"resist",
"league",
"trip",
"signature",
"lawyer",
"importance",
"gas",
"choice",
"engineer",
"success",
"part",
"external",
"worker",
"simple",
"quarter",
"student",
"heart",
"pass",
"spite",
"shift",
"rough",
"lady",
"grass",
"community",
"garage",
"youth",
"standard",
"skirt",
"promise",
"blind",
"television",
"disease",
"commission",
"positive",
"energy",
"calm",
"presence",
"tune",
"basis",
"preference",
"head",
"common",
"cut",
"somewhere",
"presentation",
"current",
"thought",
"revolution",
"effort",
"master",
"implement",
"republic",
"floor",
"principle",
"stranger",
"shoulder",
"grade",
"button",
"tennis",
"police",
"collection",
"account",
"register",
"glove",
"divide",
"professor",
"chair",
"priority",
"combine",
"peace",
"extension",
"maybe",
"evening",
"frame",
"sister",
"wave",
"code",
"application",
"mouse",
"match",
"counter",
"bottle",
"half",
"cheek",
"resolution",
"back",
"knowledge",
"make",
"discussion",
"screw",
"length",
"accident",
"battle",
"dress",
"knee",
"log",
"package",
"it",
"turn",
"hearing",
"newspaper",
"layer",
"wealth",
"profile",
"imagination",
"answer",
"weekend",
"teacher",
"appearance",
"meet",
"bike",
"rise",
"belt",
"crash",
"bowl",
"equivalent",
"support",
"image",
"poem",
"risk",
"excitement",
"remote",
"secretary",
"public",
"produce",
"plane",
"display",
"money",
"sand",
"situation",
"punch",
"customer",
"title",
"shake",
"mortgage",
"option",
"number",
"pop",
"window",
"extent",
"nothing",
"experience",
"opinion",
"departure",
"dance",
"indication",
"boy",
"material",
"band",
"leader",
"sun",
"beautiful",
"muscle",
"farmer",
"variety",
"fat",
"handle",
"director",
"opportunity",
"calendar",
"outside",
"pace",
"bath",
"fish",
"consequence",
"put",
"owner",
"go",
"doctor",
"information",
"share",
"hurt",
"protection",
"career",
"finance",
"force",
"golf",
"garbage",
"aspect",
"kid",
"food",
"boot",
"milk",
"respond",
"objective",
"reality",
"raw",
"ring",
"mall",
"one",
"impact",
"area",
"news",
"international",
"series",
"impress",
"mother",
"shelter",
"strike",
"loan",
"month",
"seat",
"anything",
"entertainment",
"familiar",
"clue",
"year",
"glad",
"supermarket",
"natural",
"god",
"cost",
"conversation",
"tie",
"ruin",
"comfort",
"earth",
"storm",
"percentage",
"assistance",
"budget",
"strength",
"beginning",
"sleep",
"other",
"young",
"unit",
"fill",
"store",
"desire",
"hide",
"value",
"cup",
"maintenance",
"nurse",
"function",
"tower",
"role",
"class",
"camera",
"database",
"panic",
"nation",
"basket",
"ice",
"art",
"spirit",
"chart",
"exchange",
"feedback",
"statement",
"reputation",
"search",
"hunt",
"exercise",
"nasty",
"notice",
"male",
"yard",
"annual",
"collar",
"date",
"platform",
"plant",
"fortune",
"passion",
"friendship",
"spread",
"cancer",
"ticket",
"attitude",
"island",
"active",
"object",
"service",
"buyer",
"bite",
"card",
"face",
"steak",
"proposal",
"patient",
"heat",
"rule",
"resident",
"broad",
"politics",
"west",
"knife",
"expert",
"girl",
"design",
"salt",
"baseball",
"grab",
"inspection",
"cousin",
"couple",
"magazine",
"cook",
"dependent",
"security",
"chicken",
"version",
"currency",
"ladder",
"scheme",
"kitchen",
"employment",
"local",
"attention",
"manager",
"fact",
"cover",
"sad",
"guard",
"relative",
"county",
"rate",
"lunch",
"program",
"initiative",
"gear",
"bridge",
"breast",
"talk",
"dish",
"guarantee",
"beer",
"vehicle",
"reception",
"woman",
"substance",
"copy",
"lecture",
"advantage",
"park",
"cold",
"death",
"mix",
"hold",
"scale",
"tomorrow",
"blood",
"request",
"green",
"cookie",
"church",
"strip",
"forever",
"beyond",
"debt",
"tackle",
"wash",
"following",
"feel",
"maximum",
"sector",
"sea",
"property",
"economics",
"menu",
"bench",
"try",
"language",
"start",
"call",
"solid",
"address",
"income",
"foot",
"senior",
"honey",
"few",
"mixture",
"cash",
"grocery",
"link",
"map",
"form",
"factor",
"pot",
"model",
"writer",
"farm",
"winter",
"skill",
"anywhere",
"birthday",
"policy",
"release",
"husband",
"lab",
"hurry",
"mail",
"equipment",
"sink",
"pair",
"driver",
"consideration",
"leather",
"skin",
"blue",
"boat",
"sale",
"brick",
"two",
"feed",
"square",
"dot",
"rush",
"dream",
"location",
"afternoon",
"manufacturer",
"control",
"occasion",
"trouble",
"introduction",
"advice",
"bet",
"eat",
"kill",
"category",
"manner",
"office",
"estate",
"pride",
"awareness",
"slip",
"crack",
"client",
"nail",
"shoot",
"membership",
"soft",
"anybody",
"web",
"official",
"individual",
"pizza",
"interest",
"bag",
"spell",
"profession",
"queen",
"deal",
"resource",
"ship",
"guy",
"chocolate",
"joint",
"formal",
"upstairs",
"car",
"resort",
"abroad",
"dealer",
"associate",
"finger",
"surgery",
"comment",
"team",
"detail",
"crazy",
"path",
"tale",
"initial",
"arm",
"radio",
"demand",
"single",
"draw",
"yellow",
"contest",
"piece",
"quote",
"pull",
"commercial",
"shirt",
"contribution",
"cream",
"channel",
"suit",
"discipline",
"instruction",
"concert",
"speech",
"low",
"effective",
"hang",
"scratch",
"industry",
"breakfast",
"lay",
"join",
"metal",
"bedroom",
"minute",
"product",
"rest",
"temperature",
"many",
"give",
"argument",
"print",
"purple",
"laugh",
"health",
"credit",
"investment",
"sell",
"setting",
"lesson",
"egg",
"middle",
"marriage",
"level",
"evidence",
"phrase",
"love",
"self",
"benefit",
"guidance",
"affect",
"you",
"dad",
"anxiety",
"special",
"boyfriend",
"test",
"blank",
"payment",
"soup",
"obligation",
"reply",
"smile",
"deep",
"complaint",
"addition",
"review",
"box",
"towel",
"minor",
"fun",
"soil",
"issue",
"cigarette",
"internet",
"gain",
"tell",
"entry",
"spare",
"incident",
"family",
"refuse",
"branch",
"can",
"pen",
"grandfather",
"constant",
"tank",
"uncle",
"climate",
"ground",
"volume",
"communication",
"kind",
"poet",
"child",
"screen",
"mine",
"quit",
"gene",
"lack",
"charity",
"memory",
"tooth",
"fear",
"mention",
"marketing",
"reveal",
"reason",
"court",
"season",
"freedom",
"land",
"sport",
"audience",
"classroom",
"law",
"hook",
"win",
"carry",
"eye",
"smell",
"distribution",
"research",
"country",
"dare",
"hope",
"whereas",
"stretch",
"library",
"if",
"delay",
"college",
"plastic",
"book",
"present",
"use",
"worry",
"champion",
"goal",
"economy",
"march",
"election",
"reflection",
"midnight",
"slide",
"inflation",
"action",
"challenge",
"guitar",
"coast",
"apple",
"campaign",
"field",
"jacket",
"sense",
"way",
"visual",
"remove",
"weather",
"trash",
"cable",
"regret",
"buddy",
"beach",
"historian",
"courage",
"sympathy",
"truck",
"tension",
"permit",
"nose",
"bed",
"son",
"person",
"base",
"meat",
"usual",
"air",
"meeting",
"worth",
"game",
"independence",
"physical",
"brief",
"play",
"raise",
"board",
"she",
"key",
"writing",
"pick",
"command",
"party",
"yesterday",
"spring",
"candidate",
"physics",
"university",
"concern",
"development",
"change",
"string",
"target",
"instance",
"room",
"bitter",
"bird",
"football",
"normal",
"split",
"impression",
"wood",
"long",
"meaning",
"stock",
"cap",
"leadership",
"media",
"ambition",
"fishing",
"essay",
"salad",
"repair",
"today",
"designer",
"night",
"bank",
"drawing",
"inevitable",
"phase",
"vast",
"chip",
"anger",
"switch",
"cry",
"twist",
"personality",
"attempt",
"storage",
"being",
"preparation",
"bat",
"selection",
"white",
"technology",
"contract",
"side",
"section",
"station",
"till",
"structure",
"tongue",
"taste",
"truth",
"difficulty",
"group",
"limit",
"main",
"move",
"feeling",
"light",
"example",
"mission",
"might",
"wait",
"wheel",
"shop",
"host",
"classic",
"alternative",
"cause",
"agent",
"consist",
"table",
"airline",
"text",
"pool",
"craft",
"range",
"fuel",
"tool",
"partner",
"load",
"entrance",
"deposit",
"hate",
"article",
"video",
"summer",
"feature",
"extreme",
"mobile",
"hospital",
"flight",
"fall",
"pension",
"piano",
"fail",
"result",
"rub",
"gap",
"system",
"report",
"suck",
"ordinary",
"wind",
"nerve",
"ask",
"shine",
"note",
"line",
"mom",
"perception",
"brother",
"reference",
"bend",
"charge",
"treat",
"trick",
"term",
"homework",
"bake",
"bid",
"status",
"project",
"strategy",
"orange",
"let",
"enthusiasm",
"parent",
"concentrate",
"device",
"travel",
"poetry",
"business",
"society",
"kiss",
"end",
"vegetable",
"employ",
"schedule",
"hour",
"brave",
"focus",
"process",
"movie",
"illegal",
"general",
"coffee",
"ad",
"highway",
"chemistry",
"psychology",
"hire",
"bell",
"conference",
"relief",
"show",
"neat",
"funny",
"weight",
"quality",
"club",
"daughter",
"zone",
"touch",
"tonight",
"shock",
"burn",
"excuse",
"name",
"survey",
"landscape",
"advance",
"satisfaction",
"bread",
"disaster",
"item",
"hat",
"prior",
"shopping",
"visit",
"east",
"photo",
"home",
"idea",
"father",
"comparison",
"cat",
"pipe",
"winner",
"count",
"lake",
"fight",
"prize",
"foundation",
"dog",
"keep",
"ideal",
"fan",
"struggle",
"peak",
"safety",
"solution",
"hell",
"conclusion",
"population",
"strain",
"alarm",
"measurement",
"second",
"train",
"race",
"due",
"insurance",
"boss",
"tree",
"monitor",
"sick",
"course",
"drag",
"appointment",
"slice",
"still",
"care",
"patience",
"rich",
"escape",
"emotion",
"royal",
"female",
"childhood",
"government",
"picture",
"will",
"sock",
"big",
"gate",
"oil",
"cross",
"pin",
"improvement",
"championship",
"silly",
"help",
"sky",
"pitch",
"man",
"diamond",
"most",
"transition",
"work",
"science",
"committee",
"moment",
"fix",
"teaching",
"dig",
"specialist",
"complex",
"guide",
"people",
"dead",
"voice",
"original",
"break",
"topic",
"data",
"degree",
"reading",
"recording",
"bunch",
"reach",
"judgment",
"lie",
"regular",
"set",
"painting",
"mode",
"list",
"player",
"bear",
"north",
"wonder",
"carpet",
"heavy",
"officer",
"negative",
"clock",
"unique",
"baby",
"pain",
"assumption",
"disk",
"iron",
"bill",
"drawer",
"look",
"double",
"mistake",
"finish",
"future",
"brilliant",
"contact",
"math",
"rice",
"leave",
"restaurant",
"discount",
"sex",
"virus",
"bit",
"trust",
"event",
"wear",
"juice",
"failure",
"bug",
"context",
"mud",
"whole",
"wrap",
"intention",
"draft",
"pressure",
"cake",
"dark",
"explanation",
"space",
"angle",
"word",
"efficiency",
"management",
"habit",
"star",
"chance",
"finding",
"transportation",
"stand",
"criticism",
"flow",
"door",
"injury",
"insect",
"surprise",
"apartment",
] # pylint: disable=line-too-long
# Read-only lookup table from ISO 639-1 two-letter language codes to the
# English names of the corresponding languages (30 entries).
LANGUAGE_CODES = immutabledict.immutabledict(
    {
        "en": "English",
        "es": "Spanish",
        "pt": "Portuguese",
        "ar": "Arabic",
        "hi": "Hindi",
        "fr": "French",
        "ru": "Russian",
        "de": "German",
        "ja": "Japanese",
        "it": "Italian",
        "bn": "Bengali",
        "uk": "Ukrainian",
        "th": "Thai",
        "ur": "Urdu",
        "ta": "Tamil",
        "te": "Telugu",
        "bg": "Bulgarian",
        "ko": "Korean",
        "pl": "Polish",
        "he": "Hebrew",
        "fa": "Persian",
        "vi": "Vietnamese",
        "ne": "Nepali",
        "sw": "Swahili",
        "kn": "Kannada",
        "mr": "Marathi",
        "gu": "Gujarati",
        "pa": "Punjabi",
        "ml": "Malayalam",
        "fi": "Finnish",
    }
)
# Regex fragments used by `split_into_sentences` to recognise periods that do
# NOT end a sentence (titles, company suffixes, acronyms, domains, decimals).
_ALPHABETS = "([A-Za-z])"
_PREFIXES = "(Mr|St|Mrs|Ms|Dr)[.]"
_SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)"
_STARTERS = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
_ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
_WEBSITES = "[.](com|net|org|io|gov|edu|me)"
_DIGITS = "([0-9])"
_MULTIPLE_DOTS = r"\.{2,}"


def split_into_sentences(text: str) -> List[str]:
    """Split the text into sentences using rule-based heuristics.

    The text is processed in two phases. First, every period that is judged
    not to end a sentence is replaced by the placeholder ``<prd>``. Then each
    remaining run of ``.``, ``!`` or ``?`` is followed by a ``<stop>`` marker,
    the placeholders are turned back into periods and the text is split on
    ``<stop>``. The order of the substitutions matters: later rules rely on
    the placeholders produced by earlier ones.

    A closing quote (``"`` or ``”``) that directly follows a terminator is
    moved in front of it, so the returned sentence text has the quote and the
    punctuation swapped relative to the input.

    Args:
      text: A string that consists of zero or more sentences.

    Returns:
      A list of stripped sentence strings, in input order. An empty or
      whitespace-only input yields an empty list.
    """
    # Pad so that patterns anchored on a surrounding space also match at the
    # very beginning and end; newlines are treated as plain spaces.
    text = " " + text + " "
    text = text.replace("\n", " ")

    # --- Phase 1: protect periods that are not sentence terminators. ---
    text = re.sub(_PREFIXES, "\\1<prd>", text)
    text = re.sub(_WEBSITES, "<prd>\\1", text)
    # The following digit is matched with a lookahead so it is not consumed;
    # this protects every dot in chained numbers such as "1.2.3", not just
    # the first one.
    text = re.sub(_DIGITS + "[.](?=" + _DIGITS + ")", "\\1<prd>", text)
    # An ellipsis keeps all of its dots and always closes the sentence.
    text = re.sub(
        _MULTIPLE_DOTS,
        lambda match: "<prd>" * len(match.group(0)) + "<stop>",
        text,
    )
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    # Single-letter initials ("J. Smith").
    text = re.sub(r"\s" + _ALPHABETS + "[.] ", " \\1<prd> ", text)
    # An acronym followed by a typical sentence starter does end the sentence.
    text = re.sub(_ACRONYMS + " " + _STARTERS, "\\1<stop> \\2", text)
    # Dotted abbreviations of three and then two letters ("e.g.", "U.S.A.").
    text = re.sub(
        _ALPHABETS + "[.]" + _ALPHABETS + "[.]" + _ALPHABETS + "[.]",
        "\\1<prd>\\2<prd>\\3<prd>",
        text,
    )
    text = re.sub(_ALPHABETS + "[.]" + _ALPHABETS + "[.]", "\\1<prd>\\2<prd>", text)
    # Company/name suffixes: a terminator only when a sentence starter follows.
    text = re.sub(" " + _SUFFIXES + "[.] " + _STARTERS, " \\1<stop> \\2", text)
    text = re.sub(" " + _SUFFIXES + "[.]", " \\1<prd>", text)
    text = re.sub(" " + _ALPHABETS + "[.]", " \\1<prd>", text)

    # --- Phase 2: mark the real sentence boundaries. ---
    # Move a closing quote in front of the terminator so the quote stays with
    # the sentence it closes. Both quote styles are handled for all three
    # terminators (curly first, then straight, per terminator).
    for mark in (".", "!", "?"):
        for quote in ("”", '"'):
            text = text.replace(mark + quote, quote + mark)

    # A run of adjacent terminators ("?!", "!!") is a single boundary;
    # marking each character separately would yield punctuation-only
    # fragments such as "!".
    text = re.sub(r"([.!?]+)", "\\1<stop>", text)
    text = text.replace("<prd>", ".")

    sentences = [fragment.strip() for fragment in text.split("<stop>")]
    # Whatever follows the last <stop> is only the padding added above.
    if sentences and not sentences[-1]:
        sentences.pop()
    return sentences
def count_words(text: str) -> int:
    """Count the number of words in the text.

    A word is a maximal run of ``\\w`` characters (letters, digits and
    underscore), so punctuation and whitespace act as separators and an
    apostrophe splits a token in two ("It's" counts as two words).

    This yields the same tokens as ``nltk.tokenize.RegexpTokenizer(r"\\w+")``,
    which applies ``findall`` with the given pattern, but avoids constructing
    a new tokenizer object on every call.

    Args:
      text: The string whose words are counted.

    Returns:
      The number of ``\\w+`` matches found in `text`.
    """
    return len(re.findall(r"\w+", text))
@functools.lru_cache(maxsize=None)
def _get_sentence_tokenizer():
    """Load the English punkt sentence tokenizer through ``nltk.data.load``.

    The result is memoized by ``lru_cache``, so the resource is loaded on the
    first call only and the same object is returned afterwards.
    """
    punkt_english = nltk.data.load("nltk:tokenizers/punkt/english.pickle")
    return punkt_english
def count_sentences(text):
    """Return how many sentences the cached punkt tokenizer finds in `text`.

    Args:
      text: The string to split into sentences.

    Returns:
      The length of the list produced by the tokenizer's ``tokenize`` call.
    """
    return len(_get_sentence_tokenizer().tokenize(text))
def generate_keywords(num_keywords):
    """Pick `num_keywords` random entries from ``WORD_LIST``.

    Args:
      num_keywords: How many keywords to draw.

    Returns:
      A new list of `num_keywords` words chosen by ``random.sample``, i.e.
      drawn without replacement from ``WORD_LIST``.
    """
    keywords = random.sample(WORD_LIST, num_keywords)
    return keywords