g3casey committed on
Commit
36c9b26
1 Parent(s): 8497bb0

Changing to pasted-in text for input, since the Wikipedia API doesn't work.

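Note: the app.py diff below only switches the Gradio interface over to pasted text; the handler keeps the name get_wiki and its original signature. As a minimal sketch (not part of this commit, reusing summarize() from app.py; the function name summarize_pasted_text is hypothetical), a handler for pasted text would reduce to:

    def summarize_pasted_text(text):
        # The Textbox now delivers the pasted article directly, so no Wikipedia lookup is needed.
        orig_text_len = len(text)
        summary = summarize(text)
        return [summary, orig_text_len, len(summary)]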
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
.idea/aws.xml ADDED
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="accountSettings">
+    <option name="activeRegion" value="us-east-1" />
+    <option name="recentlyUsedRegions">
+      <list>
+        <option value="us-east-1" />
+      </list>
+    </option>
+  </component>
+</project>
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,14 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoredErrors">
+        <list>
+          <option value="N806" />
+          <option value="N803" />
+          <option value="N802" />
+        </list>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
+</project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/summaraize.iml" filepath="$PROJECT_DIR$/.idea/summaraize.iml" />
+    </modules>
+  </component>
+</project>
.idea/other.xml ADDED
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="PySciProjectComponent">
+    <option name="PY_SCI_VIEW" value="true" />
+    <option name="PY_SCI_VIEW_SUGGESTED" value="true" />
+  </component>
+</project>
.idea/summaraize.iml ADDED
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="PyDocumentationSettings">
+    <option name="renderExternalDocumentation" value="true" />
+  </component>
+</module>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
app.py CHANGED
@@ -36,7 +36,7 @@ def get_wiki(search_term):
     orig_text_len = len(text)
     text = summarize(text)
     sum_length = len(text)
-    return [text,orig_text_len,sum_length]
+    return [text, orig_text_len, sum_length]
 
 
 # def inference(file):
@@ -48,10 +48,10 @@ out_orig_test_len = gr.Number(label='Original Text Length')
 out_sum_text_len = gr.Number(label='Summarized Text Length')
 
 iface = gr.Interface(fn=get_wiki,
-                     inputs=gr.Textbox(lines=2, placeholder="Wikipedia search term here...", label='Search Term'),
+                     inputs=gr.Textbox(lines=50, placeholder="Wikipedia search term here...", label='Search Term'),
                      outputs=[out_sum_text,out_orig_test_len,out_sum_text_len],
-                     title='Wikipedia Article Summary',
-                     description='Enter a search term to get a wikipedia article associated with it. Then we will summarize the article found. ',
+                     title='Article Summary',
+                     description='Paste in an article and it will be summarized',
                      sample_inputs='guardians of the galaxy'
                      )
 iface.launch() # To create a public link, set `share=True` in `launch()`.
inference.py ADDED
@@ -0,0 +1,3 @@
+from transformers import AutoModelForSeq2SeqLM
+
+model = AutoModelForSeq2SeqLM.from_pretrained("sgugger/my-awesome-model")
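inference.py only loads the model. A minimal usage sketch (not part of this commit), assuming the checkpoint exists on the Hub and ships a matching tokenizer, and using an illustrative input string:

    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

    model_name = "sgugger/my-awesome-model"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # Tokenize the article, generate a summary, and decode it back to text.
    inputs = tokenizer("summarize: " + "Some long article text...", return_tensors="pt", truncation=True)
    summary_ids = model.generate(**inputs, max_length=128)
    print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))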
summarize_train.py ADDED
@@ -0,0 +1,109 @@
+import transformers
+from datasets import load_dataset, load_metric
+import datasets
+import random
+import pandas as pd
+from IPython.display import display, HTML
+from transformers import AutoTokenizer
+from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
+
+
+model_checkpoint = "t5-small"
+
+raw_datasets = load_dataset("xsum")
+metric = load_metric("rouge")
+
+
+
+def show_random_elements(dataset, num_examples=5):
+    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
+    picks = []
+    for _ in range(num_examples):
+        pick = random.randint(0, len(dataset) - 1)
+        while pick in picks:
+            pick = random.randint(0, len(dataset) - 1)
+        picks.append(pick)
+
+    df = pd.DataFrame(dataset[picks])
+    for column, typ in dataset.features.items():
+        if isinstance(typ, datasets.ClassLabel):
+            df[column] = df[column].transform(lambda i: typ.names[i])
+    display(HTML(df.to_html()))
+
+tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+print(transformers.__version__)
+
+if model_checkpoint in ["t5-small", "t5-base", "t5-larg", "t5-3b", "t5-11b"]:
+    prefix = "summarize: "
+else:
+    prefix = ""
+
+max_input_length = 1024
+max_target_length = 128
+
+def preprocess_function(examples):
+    inputs = [prefix + doc for doc in examples["document"]]
+    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
+
+    # Setup the tokenizer for targets
+    with tokenizer.as_target_tokenizer():
+        labels = tokenizer(examples["summary"], max_length=max_target_length, truncation=True)
+
+    model_inputs["labels"] = labels["input_ids"]
+    return model_inputs
+
+
+model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
+
+batch_size = 16
+model_name = model_checkpoint.split("/")[-1]
+args = Seq2SeqTrainingArguments(
+    f"{model_name}-finetuned-xsum",
+    evaluation_strategy = "epoch",
+    learning_rate=2e-5,
+    per_device_train_batch_size=batch_size,
+    per_device_eval_batch_size=batch_size,
+    weight_decay=0.01,
+    save_total_limit=3,
+    num_train_epochs=1,
+    predict_with_generate=True,
+    fp16=True,
+    push_to_hub=True,
+)
+
+import nltk
+import numpy as np
+
+
+def compute_metrics(eval_pred):
+    predictions, labels = eval_pred
+    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
+    # Replace -100 in the labels as we can't decode them.
+    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+    # Rouge expects a newline after each sentence
+    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
+    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
+
+    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
+    # Extract a few results
+    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
+
+    # Add mean generated length
+    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
+    result["gen_len"] = np.mean(prediction_lens)
+
+    return {k: round(v, 4) for k, v in result.items()}
+
+trainer = Seq2SeqTrainer(
+    model,
+    args,
+    train_dataset=tokenized_datasets["train"],
+    eval_dataset=tokenized_datasets["validation"],
+    data_collator=data_collator,
+    tokenizer=tokenizer,
+    compute_metrics=compute_metrics
+)
+
+
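As committed, summarize_train.py references tokenized_datasets and data_collator without defining them, and the trainer is never run. A minimal sketch of the missing pieces, assuming the preprocess_function and xsum splits defined above:

    # Tokenize every split with the preprocessing function defined above.
    tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

    # Pad inputs and labels dynamically per batch for the seq2seq model.
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    # After constructing the Seq2SeqTrainer, training would start with:
    # trainer.train()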
tester.py ADDED
@@ -0,0 +1,21 @@
+import wikipedia
+
+def search_wiki(text):
+    article_list = wikipedia.search(text)
+    wikipedia.page(article_list[0])
+
+
+def get_wiki(search_term):
+    return wikipedia.page(search_term)
+
+
+
+# src = search_wiki('spacex')
+get = get_wiki('spacex')
+# print(src)
+print(get)
+print(wikipedia.summary("Python Programming Language"))
+x = search_wiki('spacex')
+
+print('done')
+