Update parse_info.py
Browse files- parse_info.py +25 -9
parse_info.py
CHANGED
@@ -1,11 +1,4 @@
|
|
1 |
-
import
|
2 |
-
import chardet, string, gdown, re
|
3 |
-
from pathlib import Path
|
4 |
-
from nltk import everygrams
|
5 |
-
from collections import Counter
|
6 |
-
from typing import List, Optional
|
7 |
-
from datetime import datetime
|
8 |
-
from dateutil import parser, relativedelta
|
9 |
|
10 |
# Punctuation characters from string.punctuation, as a list. Used as the
# default value of the ``punc`` parameter of parse_string.
punc = list(string.punctuation)
|
11 |
def parse_string(inp: str, rep=" ", punc=punc, excp=[]) -> str:
|
@@ -90,4 +83,27 @@ def parse_designation(inp: List) -> str:
|
|
90 |
|
91 |
def parse_email(inp: List) -> str:
    """Clean each e-mail candidate and join the distinct results with spaces.

    Every item of ``inp`` is passed through ``parse_string`` with ``rep=""``
    and ``excp=["@", "."]`` (presumably so that "@" and "." survive the
    punctuation clean-up — confirm against ``parse_string``).

    Duplicates are removed with ``dict.fromkeys`` rather than ``set`` so the
    output keeps first-seen order and is identical from run to run; the
    iteration order of a set of strings changes with hash randomisation.

    Args:
        inp: Raw e-mail strings.

    Returns:
        The unique cleaned strings separated by single spaces, or ``""``
        when ``inp`` is empty.
    """
    cleaned = (parse_string(item, rep="", excp=["@", "."]) for item in inp)
    return " ".join(dict.fromkeys(cleaned))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from imports import *
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
# Punctuation characters from string.punctuation, as a list. Used as the
# default value of the ``punc`` parameter of parse_string.
punc = list(string.punctuation)
|
4 |
def parse_string(inp: str, rep=" ", punc=punc, excp=[]) -> str:
|
|
|
83 |
|
84 |
def parse_email(inp: List) -> str:
    """Clean each e-mail candidate and join the distinct results with spaces.

    Every item of ``inp`` is passed through ``parse_string`` with ``rep=""``
    and ``excp=["@", "."]`` (presumably so that "@" and "." survive the
    punctuation clean-up — confirm against ``parse_string``).

    Duplicates are removed with ``dict.fromkeys`` rather than ``set`` so the
    output keeps first-seen order and is identical from run to run; the
    iteration order of a set of strings changes with hash randomisation.

    Args:
        inp: Raw e-mail strings.

    Returns:
        The unique cleaned strings separated by single spaces, or ``""``
        when ``inp`` is empty.
    """
    cleaned = (parse_string(item, rep="", excp=["@", "."]) for item in inp)
    return " ".join(dict.fromkeys(cleaned))
|
87 |
+
|
88 |
+
def decontracted(phrase: str) -> str:
    """Clean up a raw text phrase and return it on a single line.

    Steps, in order: delete a fixed set of symbols and mis-encoded
    sequences, strip surrounding whitespace, apply Unicode NFC
    normalisation, rework the spaces, and finally replace newlines with
    spaces.

    NOTE(review): ``re`` and ``unicodedata`` are not imported in the visible
    part of this file; presumably ``from imports import *`` provides them —
    confirm.
    NOTE(review): the space-only string literals below look identical to one
    another. Runs of several spaces inside them may have been collapsed when
    this text was captured — check them against the real file before relying
    on the space handling.
    """
    # Drop mojibake fragments, quotes, bullets, brackets and similar noise.
    phrase = re.sub(r"â€|™|“|”|;|ü|\xad|\xa0|\u200b|·|∙|�|●|�|§|•|!|▪|©|\?|\]|\[|\)|\(", "", phrase)
    phrase = phrase.strip()
    phrase = unicodedata.normalize("NFC", phrase)
    if " " in phrase or " " in phrase: # check space character
        # "_" serves as a temporary placeholder here, so any "_" that was
        # already in the phrase comes back out as a space.
        phrase = phrase.replace(" ","_").replace(" ","_").replace(" ","").replace("_"," ")
    tmp = phrase.split(" ")
    # check_parse stays True only if no space-separated piece is longer than
    # one character.
    check_parse = True
    for i in tmp:
        if len(i) > 1:
            check_parse = False
            break
    if check_parse:
        # Every piece is at most one character: join them by removing spaces.
        phrase = phrase.replace(" ","")
    # phrase = phrase.replace(" "," ").replace(" "," ")
    return phrase.replace("\n"," ")
|
104 |
+
|
105 |
+
def normalize_bbox(bbox, size):  # must normalize bbox to [0;1000]
    """Scale a bounding box onto an integer 0-1000 grid.

    Args:
        bbox: Indexable with at least four numbers. Items 0 and 2 are
            divided by ``size[0]``; items 1 and 3 by ``size[1]``
            (presumably x0, y0, x1, y1 against (width, height) — confirm
            against callers).
        size: Indexable with at least two non-zero numbers used as divisors.

    Returns:
        A list of four ints. Each value is truncated toward zero and then
        clamped to ``[0, 1000]``, so a box that extends past ``size`` or has
        negative coordinates still satisfies the required range.

    Raises:
        ZeroDivisionError: If ``size[0]`` or ``size[1]`` is zero.
        IndexError: If ``bbox`` or ``size`` is too short.
    """
    pairs = (
        (bbox[0], size[0]),
        (bbox[1], size[1]),
        (bbox[2], size[0]),
        (bbox[3], size[1]),
    )
    return [
        min(1000, max(0, int(1000 * coord / extent)))
        for coord, extent in pairs
    ]
|