Spaces:

Pclanglais
/

Editorialization

Running

App Files Files Community

Pclanglais committed on Jul 3

Commit

4c91de3

•

1 Parent(s): 1cf35d9

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -30

app.py CHANGED Viewed

@@ -19,6 +19,27 @@ token_classifier = pipeline(
 tokenizer = AutoTokenizer.from_pretrained(editorial_model, model_max_length=512)
 # Preprocess the 'word' column
 def preprocess_text(text):
     # Remove HTML tags
@@ -75,32 +96,24 @@ def split_text(text, max_tokens=500):
     return chunks
 def transform_chunks(marianne_segmentation):
-    print(marianne_segmentation)
     marianne_segmentation = pd.DataFrame(marianne_segmentation)
-    print(marianne_segmentation)
-    # Filter out separators
     marianne_segmentation = marianne_segmentation[marianne_segmentation['entity_group'] != 'separator']
-    # Replace '¶' with '\n' and convert to string
     marianne_segmentation['word'] = marianne_segmentation['word'].astype(str).str.replace('¶', '\n', regex=False)
-    #A bit of lceaning.
     marianne_segmentation['word'] = marianne_segmentation['word'].astype(str).apply(preprocess_text)
-    marianne_segmentation = marianne_segmentation[marianne_segmentation['word'] != 'nan']
-    marianne_segmentation = marianne_segmentation[marianne_segmentation['word'] != '']
-    marianne_segmentation = marianne_segmentation[marianne_segmentation['word'] != ' ']
-    # Add entity_group as a header to each word
-    marianne_segmentation['word'] = '### ' + marianne_segmentation['entity_group'] + ' ###\n' + marianne_segmentation['word']
-    final_text = '\n\n'.join(marianne_segmentation['word'].tolist())
-    return final_text
 # Class encapsulating the chatbot (MistralChatBot)
@@ -109,22 +122,17 @@ class MistralChatBot:
         self.system_prompt = system_prompt
     def predict(self, user_message):
-        #We drop the newlines.
-        editorial_text =  re.sub("\n", " ¶ ", user_message)
-        # Tokenize the prompt and check if it exceeds 500 tokens
         num_tokens = len(tokenizer.tokenize(editorial_text))
         if num_tokens > 500:
-            # Split the prompt into chunks
             batch_prompts = split_text(editorial_text, max_tokens=500)
         else:
             batch_prompts = [editorial_text]
         out = token_classifier(batch_prompts)
         out = transform_chunks(out[0])
-        print(out)
-        generated_text = '<h2 style="text-align:center">Réponse</h3>\n<div class="generation">' + out + "</div>"
         return generated_text
 # Create the MistralChatBot instance

 tokenizer = AutoTokenizer.from_pretrained(editorial_model, model_max_length=512)
+css = """
+<style>
+.manuscript {
+    display: flex;
+    margin-bottom: 20px;
+}
+.annotation {
+    width: 30%;
+    padding-right: 20px;
+    color: grey;
+    font-style: italic;
+}
+.content {
+    width: 70%;
+}
+h3 {
+    margin-top: 0;
+}
+</style>
+"""
 # Preprocess the 'word' column
 def preprocess_text(text):
     # Remove HTML tags
     return chunks
 def transform_chunks(marianne_segmentation):
     """Render classifier segments as two-column HTML rows.

     Args:
         marianne_segmentation: Records accepted by ``pd.DataFrame`` that
             provide an ``entity_group`` and a ``word`` field per segment.

     Returns:
         A string with one ``<div class="manuscript">`` row per kept segment,
         joined with newlines. The label goes in the ``annotation`` column and
         the text in the ``content`` column; segments labelled ``title`` are
         wrapped in ``<h3>``. Returns ``''`` when there is nothing to render.
     """
     frame = pd.DataFrame(marianne_segmentation)
     # An empty result has no columns at all, so selecting 'entity_group'
     # below would raise KeyError instead of producing empty output.
     if frame.empty:
         return ''

     # Drop separators and genuinely missing words *before* casting to str:
     # after astype(str) a missing value is the literal text 'nan' and a
     # later notna() check can no longer detect it. The copy() keeps the
     # following work on an independent frame rather than a filtered slice.
     frame = frame[frame['entity_group'] != 'separator'].dropna(subset=['word']).copy()

     # '¶' stands for a newline in the classifier input; restore it, then
     # apply the file's text clean-up helper.
     words = (
         frame['word']
         .astype(str)
         .str.replace('¶', '\n', regex=False)
         .apply(preprocess_text)
     )

     # Keep only segments that still carry visible text (this also covers a
     # clean-up helper that returns None, and whitespace-only words such as
     # a lone restored newline).
     keep = words.notna() & (words.astype(str).str.strip() != '')

     # NOTE(review): `word` is interpolated into HTML unescaped. The comment on
     # preprocess_text says it removes HTML tags, but its body is not visible
     # here - confirm it never emits markup before adding html.escape().
     html_output = []
     for entity_group, word in zip(frame.loc[keep, 'entity_group'], words[keep]):
         content = f'<h3>{word}</h3>' if entity_group == 'title' else word
         html_output.append(
             f'<div class="manuscript"><div class="annotation">{entity_group}</div>'
             f'<div class="content">{content}</div></div>'
         )
     return '\n'.join(html_output)
 # Class encapsulating the chatbot (MistralChatBot)
         self.system_prompt = system_prompt
     def predict(self, user_message):
         """Segment ``user_message`` with the token classifier and return HTML.

         Args:
             user_message: Raw text submitted by the user.

         Returns:
             An HTML string: the module-level ``css`` block, a centred
             "Réponse" heading, and the rows built by ``transform_chunks``
             wrapped in ``<div class="generation">``.
         """
         # Newlines are encoded as ' ¶ ' before classification;
         # transform_chunks turns '¶' back into '\n' afterwards.
         editorial_text = re.sub("\n", " ¶ ", user_message)

         # Inputs longer than 500 tokens are split into several prompts.
         num_tokens = len(tokenizer.tokenize(editorial_text))
         if num_tokens > 500:
             batch_prompts = split_text(editorial_text, max_tokens=500)
         else:
             batch_prompts = [editorial_text]

         out = token_classifier(batch_prompts)

         # Gather the entities of *every* prompt. Taking only out[0] would
         # silently discard everything after the first chunk of a long text.
         # NOTE(review): assumes the pipeline returns one list of entity dicts
         # per prompt; a flat list of dicts is passed through unchanged.
         if out and isinstance(out[0], dict):
             segments = out
         else:
             segments = [segment for chunk_result in out for segment in chunk_result]

         out = transform_chunks(segments)
         generated_text = f'{css}<h2 style="text-align:center">Réponse</h2>\n<div class="generation">{out}</div>'
         return generated_text
 # Create the MistralChatBot instance