Pclanglais committed 4c91de3 (parent: 1cf35d9): Update app.py
app.py CHANGED
@@ -19,6 +19,27 @@ token_classifier = pipeline(
 
 tokenizer = AutoTokenizer.from_pretrained(editorial_model, model_max_length=512)
 
+css = """
+<style>
+.manuscript {
+    display: flex;
+    margin-bottom: 20px;
+}
+.annotation {
+    width: 30%;
+    padding-right: 20px;
+    color: grey;
+    font-style: italic;
+}
+.content {
+    width: 70%;
+}
+h3 {
+    margin-top: 0;
+}
+</style>
+"""
+
 # Preprocess the 'word' column
 def preprocess_text(text):
     # Remove HTML tags
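The style block added here targets the .manuscript, .annotation and .content markup that the reworked transform_chunks (next hunk) emits. A minimal preview sketch, assuming nothing beyond the rules and div structure in this diff; the sample row and output filename are illustrative:

# Illustrative preview of the new layout: the <style> rules are copied from the hunk
# above, and the sample row mirrors the markup transform_chunks builds.
css = ('<style>.manuscript{display:flex;margin-bottom:20px}'
       '.annotation{width:30%;padding-right:20px;color:grey;font-style:italic}'
       '.content{width:70%}h3{margin-top:0}</style>')
row = ('<div class="manuscript">'
       '<div class="annotation">title</div>'
       '<div class="content"><h3>Chapitre premier</h3></div>'
       '</div>')
with open("preview.html", "w", encoding="utf-8") as f:  # throwaway file, name is arbitrary
    f.write(css + row)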
@@ -75,32 +96,24 @@ def split_text(text, max_tokens=500):
     return chunks
 
 def transform_chunks(marianne_segmentation):
-
-    print(marianne_segmentation)
-
     marianne_segmentation = pd.DataFrame(marianne_segmentation)
-
-    print(marianne_segmentation)
-
-    # Filter out separators
     marianne_segmentation = marianne_segmentation[marianne_segmentation['entity_group'] != 'separator']
-
-    # Replace '¶' with '\n' and convert to string
     marianne_segmentation['word'] = marianne_segmentation['word'].astype(str).str.replace('¶', '\n', regex=False)
-
-    #A bit of lceaning.
     marianne_segmentation['word'] = marianne_segmentation['word'].astype(str).apply(preprocess_text)
-
-
-
-
-
-
-
-
-
+    marianne_segmentation = marianne_segmentation[marianne_segmentation['word'].notna() & (marianne_segmentation['word'] != '') & (marianne_segmentation['word'] != ' ')]
+
+    html_output = []
+    for _, row in marianne_segmentation.iterrows():
+        entity_group = row['entity_group']
+        word = row['word']
+
+        if entity_group == 'title':
+            html_output.append(f'<div class="manuscript"><div class="annotation">{entity_group}</div><div class="content"><h3>{word}</h3></div></div>')
+        else:
+            html_output.append(f'<div class="manuscript"><div class="annotation">{entity_group}</div><div class="content">{word}</div></div>')
 
-
+    final_html = '\n'.join(html_output)
+    return final_html
 
 
 # Class to encapsulate the Falcon chatbot
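The rewritten transform_chunks returns annotated HTML instead of plain text. Below is a standalone sketch of the same logic on hand-made input; it assumes token_classifier is a transformers token-classification pipeline whose aggregated output carries 'entity_group' and 'word' fields, and it stubs preprocess_text because that function's body sits outside this hunk:

import pandas as pd

def preprocess_text(text):
    # Stand-in for the app's cleaner (defined earlier in app.py, not shown in this hunk)
    return text.strip()

# Hand-made sample shaped like the pipeline's aggregated output
sample = [
    {"entity_group": "title", "word": "Chapitre premier"},
    {"entity_group": "separator", "word": "¶"},
    {"entity_group": "paragraph", "word": "Il était une fois ¶ un manuscrit."},
]

df = pd.DataFrame(sample)
df = df[df["entity_group"] != "separator"]                                # drop separators
df["word"] = df["word"].astype(str).str.replace("¶", "\n", regex=False)   # restore newlines
df["word"] = df["word"].astype(str).apply(preprocess_text)
df = df[df["word"].notna() & (df["word"] != "") & (df["word"] != " ")]    # drop empty rows

html_output = []
for _, row in df.iterrows():
    content = f"<h3>{row['word']}</h3>" if row["entity_group"] == "title" else row["word"]
    html_output.append(
        f'<div class="manuscript"><div class="annotation">{row["entity_group"]}</div>'
        f'<div class="content">{content}</div></div>'
    )
print("\n".join(html_output))  # one styled row per surviving segment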
@@ -109,22 +122,17 @@ class MistralChatBot:
         self.system_prompt = system_prompt
 
     def predict(self, user_message):
-
-        editorial_text = re.sub("\n", " ¶ ", user_message)
-
-        # Tokenize the prompt and check if it exceeds 500 tokens
+        editorial_text = re.sub("\n", " ¶ ", user_message)
         num_tokens = len(tokenizer.tokenize(editorial_text))
-
+
         if num_tokens > 500:
-            # Split the prompt into chunks
            batch_prompts = split_text(editorial_text, max_tokens=500)
         else:
             batch_prompts = [editorial_text]
-
+
         out = token_classifier(batch_prompts)
         out = transform_chunks(out[0])
-
-        generated_text = '<h2 style="text-align:center">Réponse</h3>\n<div class="generation">' + out + "</div>"
+        generated_text = f'{css}<h2 style="text-align:center">Réponse</h2>\n<div class="generation">{out}</div>'
         return generated_text
 
 # Create the Falcon chatbot instance
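With these changes, predict normalizes newlines to '¶', splits inputs longer than 500 tokens with split_text, runs the token classifier, and renders the first chunk's segments with the CSS added above. A hedged usage sketch; the instance name and system prompt are placeholders, since the constructor call sits outside this diff:

# Hypothetical call site: __init__ appears to take a system prompt (per the context
# line above), but the actual prompt string lives elsewhere in app.py.
bot = MistralChatBot(system_prompt="...")
html = bot.predict("Chapitre premier\nIl était une fois un manuscrit.")
# `html` starts with the css <style> block, then the centered "Réponse" header,
# then one .manuscript div per classified segment of the first chunk.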