gamingflexer committed on
Commit
9269cc3
·
1 Parent(s): eb08878

Pdf Text extraction

Browse files
Files changed (1) hide show
  1. src/plagiarism/preprocessing.py +142 -0
src/plagiarism/preprocessing.py CHANGED
@@ -1,5 +1,147 @@
1
  from nltk.corpus import stopwords
2
  from nltk.stem import WordNetLemmatizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
 
5
  def remove_numbers(words_list: list) -> list:
 
1
  from nltk.corpus import stopwords
2
  from nltk.stem import WordNetLemmatizer
3
+ import time
4
+ from operator import itemgetter
5
+ import fitz
6
+ import re
7
+
8
def fonts(doc, granularity=False):
    """Extract the font styles used in a PDF document and count their usage.

    Every text span of every page is assigned an identifier: the span's font
    size alone, or (with ``granularity``) the combination of size, flags,
    font name and color. The identifier is always a string.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool

    :rtype: [(identifier, count), (identifier, count)], dict
    :return: identifiers sorted by usage count (most used first), and a dict
        mapping each identifier to its style information
    :raises ValueError: if no text span was found in the document
    """
    styles = {}
    font_counts = {}

    for page in doc:
        for block in page.get_text("dict")["blocks"]:
            if block['type'] != 0:  # only type-0 blocks contain text
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    if granularity:
                        identifier = "{0}_{1}_{2}_{3}".format(
                            span['size'], span['flags'], span['font'], span['color'])
                        styles[identifier] = {
                            'size': span['size'],
                            'flags': span['flags'],
                            'font': span['font'],
                            'color': span['color'],
                        }
                    else:
                        identifier = "{0}".format(span['size'])
                        # Keyed by size only, so the stored font is the one
                        # of the most recently seen span of that size.
                        styles[identifier] = {'size': span['size'], 'font': span['font']}

                    # count the usage of this style
                    font_counts[identifier] = font_counts.get(identifier, 0) + 1

    if not font_counts:
        raise ValueError("Zero discriminating fonts found!")

    ranked = sorted(font_counts.items(), key=itemgetter(1), reverse=True)
    return ranked, styles
44
+
45
+
46
def font_tags(font_counts, styles):
    """Return a dictionary with font sizes as keys and element tags as values.

    The size of the most used style is treated as the paragraph size and
    tagged ``<p>``. Larger sizes are tagged ``<h1>``, ``<h2>``, ... from the
    largest downwards; smaller sizes are tagged ``<s1>``, ``<s2>``, ... from
    the largest of them downwards.

    :param font_counts: (identifier, count) for all styles occurring in the
        document, most used first
    :type font_counts: list
    :param styles: all styles found in the document, keyed by identifier
    :type styles: dict

    :rtype: dict
    :return: all element tags based on font sizes (keys are floats)
    """
    # The most used style by count is taken to be the paragraph style.
    p_size = styles[font_counts[0][0]]['size']

    # Read each size from the style record rather than parsing the identifier:
    # a granular identifier ("size_flags_font_color") is not a valid float.
    # Several granular styles may share one size, hence the set.
    font_sizes = sorted(
        {float(styles[identifier]['size']) for identifier, _ in font_counts},
        reverse=True,
    )

    # Walk the sizes high to low so the right integer is appended to each tag.
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0  # restart numbering for the sizes below the paragraph size
            size_tag[size] = '<p>'
        elif size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        else:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag
80
+
81
+
82
def headers_para(doc, size_tag):
    """Scrape the paragraph texts from a PDF document.

    Only spans whose font size is tagged ``'<p>'`` in ``size_tag`` contribute
    text; headers and smaller text are skipped. The returned strings carry no
    element tags.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict

    :rtype: list
    :return: paragraph texts, in document order
    """
    paragraphs = []  # list with paragraphs
    previous_s = None  # previous non-blank span, None until the first one is seen

    for page in doc:
        for block in page.get_text("dict")["blocks"]:
            if block['type'] != 0:  # only type-0 blocks contain text
                continue

            # REMEMBER: multiple fonts and sizes are possible IN one block
            block_string = ""  # paragraph text collected in this block
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span['text']
                    if not text.strip():  # skip whitespace-only spans
                        continue

                    if previous_s is not None and span['size'] == previous_s['size']:
                        if block_string:
                            # same size as the running text, so concatenate
                            block_string += " " + text
                        elif size_tag[span['size']] == '<p>':
                            # A new block that continues at paragraph size:
                            # start collecting instead of dropping the block.
                            block_string = text
                    else:
                        # The size changed (or this is the first span): flush
                        # what was collected, then start over with this span
                        # if it is paragraph text.
                        if block_string:
                            paragraphs.append(block_string)
                        block_string = text if size_tag[span['size']] == '<p>' else ''

                    previous_s = span

            # Append the last paragraph of the block; short fragments are
            # dropped here (this length filter is not applied to the
            # mid-block flush above).
            if len(block_string) > 80:
                paragraphs.append(block_string)

    return paragraphs
129
+
130
def get_pdf_info(document_path):
    """Extract the cleaned full text and the long paragraphs of a PDF file.

    :param document_path: path of the PDF file, passed to ``fitz.open``
    :rtype: tuple
    :return: ``(text, paragraphs)`` where ``text`` is the lower-cased text of
        all pages with number-only line endings removed, and ``paragraphs``
        is the list of lower-cased paragraphs longer than 100 characters
    :raises ValueError: propagated from ``fonts`` when the document contains
        no text spans
    """
    docs = fitz.open(document_path)
    try:
        # Each page's text is followed by a single space separator.
        only_text = "".join(page.get_text() + " " for page in docs)
        font_counts, styles = fonts(docs, granularity=False)
        size_tag = font_tags(font_counts, styles)
        elements = headers_para(docs, size_tag)
    finally:
        # Release the file handle even if extraction fails.
        docs.close()

    paragraphs = [element.lower() for element in elements if len(element) > 100]

    # Strip integers/decimals that are immediately followed by a newline.
    pattern = r'\d+(\.\d+)?\n'
    cleaned_text = re.sub(pattern, '', only_text)
    return cleaned_text.lower(), paragraphs
145
 
146
 
147
  def remove_numbers(words_list: list) -> list: