# File size: 3,995 Bytes | 103c053
import sys
import math
class TextProcessor:
    """Heuristic, statistics-based helpers for splitting a text into tokens.

    The instance wraps a single text (``self.texto``).  The methods compute
    symbol statistics, pick a separator symbol, and derive shared sub-tokens
    between the chunks produced by that separator.
    """

    def __init__(self, texto):
        """Store the text that the other methods operate on."""
        self.texto = texto

    def entropy(self):
        """Count symbols and compute the entropy of ``self.texto``.

        Returns:
            A ``(counts, entropy)`` tuple: ``counts`` maps each character to
            its number of occurrences and ``entropy`` is
            ``-sum(p * log2(p))`` over the character probabilities.  For an
            empty text the result is ``({}, 0)``.
        """
        simbolos = {}
        for caracter in self.texto:
            simbolos[caracter] = simbolos.get(caracter, 0) + 1

        total_caracteres = len(self.texto)
        entropia = 0
        # The loop body never runs for an empty text, so there is no
        # division by zero.
        for count in simbolos.values():
            probabilidad = count / total_caracteres
            entropia -= probabilidad * math.log2(probabilidad)
        return simbolos, entropia

    def common_string(self, cadena1, cadena2):
        """Return the longest substring shared by ``cadena1`` and ``cadena2``.

        When several common substrings have the maximal length, the one
        found first (scanning ``cadena1`` left to right, then ``cadena2``
        left to right) is returned.  Returns ``''`` when the strings share
        no character.
        """
        longitud1 = len(cadena1)
        longitud2 = len(cadena2)
        comun = ''
        for i in range(longitud1):
            for j in range(longitud2):
                # Extend the match that starts at cadena1[i] / cadena2[j].
                k = 0
                while (i + k < longitud1 and j + k < longitud2
                       and cadena1[i + k] == cadena2[j + k]):
                    k += 1
                # Strict comparison keeps the first longest match.
                if k > len(comun):
                    comun = cadena1[i:i + k]
        return comun

    def magic_split(self):
        """Pick the symbol whose occurrences are most regularly spaced.

        For every symbol that occurs at least twice, the distances between
        consecutive occurrences are computed, and the symbol's "variation"
        is ``max(distances) - min(distances)``.  Symbols with a variation of
        0 or 1 are discarded; among the rest, the one with the smallest
        variation is returned.  Ties are resolved in favour of the symbol
        that appears first in the text.

        Raises:
            ValueError: if no symbol has a variation greater than 1.
        """
        # One pass over the text; dict insertion order gives a
        # deterministic (first-appearance) candidate order.
        positions = {}
        for index, symbol in enumerate(self.texto):
            positions.setdefault(symbol, []).append(index)

        mins = {}
        for symbol, indices in positions.items():
            if len(indices) < 2:
                continue
            distances = [nxt - cur for cur, nxt in zip(indices, indices[1:])]
            variation = max(distances) - min(distances)
            # Variation is never negative, so "> 1" is "not 0 and not 1".
            if variation > 1:
                mins[symbol] = variation

        if not mins:
            raise ValueError(
                "magic_split: no symbol with irregular enough spacing "
                "was found in the text"
            )
        return min(mins, key=mins.get)

    def rotate_string(self, string, n):
        """Return ``string`` rotated left by ``n`` positions (modulo length).

        An empty string is returned unchanged.
        """
        if not string:
            # Avoid the modulo-by-zero below.
            return string
        indice = n % len(string)
        return string[indice:] + string[:indice]

    def rotate_compare(self, tokiA, tokiB):
        """Find the best common substring between rotations of two tokens.

        The lexicographically greater token is rotated and each rotation is
        compared against the other token with :meth:`common_string`.  The
        longest result whose length is greater than 1 and smaller than
        ``len(tokiA)`` is returned (first one wins on ties); ``""`` is
        returned when no rotation qualifies.
        """
        if tokiA >= tokiB:
            tokA, tokB = tokiA, tokiB
        else:
            tokA, tokB = tokiB, tokiA
        # NOTE(review): the rotation count and the upper length bound are
        # both len(tokiA), whichever token ends up being rotated.  Kept
        # as-is; confirm whether len(tokA) was intended.
        ltokA = len(tokiA)

        best_r = ""
        for i in range(ltokA):
            rot = self.common_string(self.rotate_string(tokA, i), tokB)
            lrot = len(rot)
            if 1 < lrot < ltokA and lrot > len(best_r):
                best_r = rot
        return best_r

    def get_subTokens(self, spl):
        """Return the distinct sub-tokens shared between chunks of the text.

        The text is split on ``spl`` and every ordered pair of different
        chunks is passed to :meth:`rotate_compare`.  The results (which may
        include ``""``) are de-duplicated, keeping first-seen order so the
        output is deterministic.
        """
        sub_tokens = self.texto.split(spl)
        toks = [
            self.rotate_compare(tok, tok2)
            for tok in sub_tokens
            for tok2 in sub_tokens
            if tok != tok2
        ]
        return list(dict.fromkeys(toks))

    def tokenize(self, spliter_optimo):
        """Split each chunk of the text around its longest known sub-token.

        Args:
            spliter_optimo: separator used to cut ``self.texto`` into chunks.

        Returns:
            A dict mapping each chunk that contains a sub-token to the string
            ``" <before>-<token>-<after>"``, where the token is the longest
            sub-token that occurs in the chunk and is shorter than the chunk.
            Chunks without a matching sub-token are omitted.
        """
        tokens = self.get_subTokens(spliter_optimo)
        tokenized_sentence = {}
        for txt in self.texto.split(spliter_optimo):
            best_split = ""
            for tok in tokens:
                if tok == "":
                    continue
                # partition() cuts at the first occurrence and keeps the
                # whole remainder, so repeated tokens do not lose text.
                before, found, after = txt.partition(tok)
                if not found:
                    continue
                if len(best_split) < len(tok) < len(txt):
                    best_split = tok
                    tokenized_sentence[txt] = (
                        " " + before + "-" + tok + "-" + after
                    )
        return tokenized_sentence
# Example usage: python <this_script> "<text to analyse>"
# Guarded so that importing the module does not read sys.argv or print.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Without this check a missing argument surfaces as a bare IndexError.
        sys.exit("Usage: python " + sys.argv[0] + " <text>")
    texto_ejemplo = sys.argv[1]
    text_processor = TextProcessor(texto_ejemplo)
    try:
        spliter_optimo = text_processor.magic_split()
    except ValueError as error:
        # magic_split raises ValueError when no symbol qualifies as splitter.
        sys.exit("No se pudo determinar el spliter óptimo: " + str(error))
    print("Spliter óptimo:", spliter_optimo)