Spaces:

klasocki
/

comma-fixer

Sleeping

App Files Files Community

klasocki committed on Aug 18, 2023

Commit

b1106e6

1 Parent(s): 35c0239

Refactor and add more tests

Browse files

Files changed (4) hide show

app.py +5 -4
src/baseline.py +25 -16
tests/test_baseline.py +9 -2
tests/test_integration.py +2 -2

app.py CHANGED Viewed

@@ -16,14 +16,15 @@ def root():
 @app.route('/baseline/fix-commas/', methods=['POST'])
 def fix_commas_with_baseline():
     data = request.get_json()
-    if 's' in data:
-        return make_response(jsonify({'s': fix_commas(app.baseline_pipeline, data['s'])}), 200)
     else:
-        return make_response("Parameter 's' missing", 400)
 if __name__ == '__main__':
     logger.info("Loading the baseline model.")
     app.baseline_pipeline = create_baseline_pipeline()
-    app.run(debug=True)

 @app.route('/baseline/fix-commas/', methods=['POST'])
 def fix_commas_with_baseline():
+    json_field_name = 's'
     data = request.get_json()
+    if json_field_name in data:
+        return make_response(jsonify({json_field_name: fix_commas(app.baseline_pipeline, data['s'])}), 200)
     else:
+        return make_response(f"Parameter '{json_field_name}' missing", 400)
 if __name__ == '__main__':
     logger.info("Loading the baseline model.")
     app.baseline_pipeline = create_baseline_pipeline()
+    app.run(debug=True) # TODO get this from config or env variable

src/baseline.py CHANGED Viewed

@@ -1,12 +1,19 @@
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, NerPipeline
-def create_baseline_pipeline() -> NerPipeline:
-    tokenizer = AutoTokenizer.from_pretrained("oliverguhr/fullstop-punctuation-multilang-large")
-    model = AutoModelForTokenClassification.from_pretrained("oliverguhr/fullstop-punctuation-multilang-large")
     return pipeline('ner', model=model, tokenizer=tokenizer)
 def _remove_punctuation(s: str) -> str:
     to_remove = ".,?-:"
     for char in to_remove:
@@ -14,23 +21,25 @@ def _remove_punctuation(s: str) -> str:
     return s
-def _convert_pipeline_json_to_string(pipeline_json: list[dict], original_s: str) -> str:
-    # TODO is it ok to remove redundant spaces, or should we keep input data as is and only touch commas?
-    # TODO don't accept tokens with commas inside words
-    result = original_s.replace(',', '') # We will fix the commas, but keep everything else intact
     current_offset = 0
     for i in range(1, len(pipeline_json)):
-        current_word = pipeline_json[i - 1]['word'].replace('▁', '')
-        current_offset = result.find(current_word, current_offset) + len(current_word)
-        # Only insert commas for the final token of a word
-        if pipeline_json[i - 1]['entity'] == ',' and pipeline_json[i]['word'].startswith('▁'):
             result = result[:current_offset] + ',' + result[current_offset:]
             current_offset += 1
     return result
-def fix_commas(ner_pipeline: NerPipeline, s: str) -> str:
-    return _convert_pipeline_json_to_string(
-        ner_pipeline(_remove_punctuation(s)),
-        s
-    )

 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, NerPipeline
+def create_baseline_pipeline(model_name="oliverguhr/fullstop-punctuation-multilang-large") -> NerPipeline:
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForTokenClassification.from_pretrained(model_name)
     return pipeline('ner', model=model, tokenizer=tokenizer)
+def fix_commas(ner_pipeline: NerPipeline, s: str) -> str:
+    return _fix_commas_based_on_pipeline_output(
+        ner_pipeline(_remove_punctuation(s)),
+        s
+    )
 def _remove_punctuation(s: str) -> str:
     to_remove = ".,?-:"
     for char in to_remove:
     return s
+def _fix_commas_based_on_pipeline_output(pipeline_json: list[dict], original_s: str) -> str:
+    result = original_s.replace(',', '')  # We will fix the commas, but keep everything else intact
     current_offset = 0
     for i in range(1, len(pipeline_json)):
+        current_offset = _find_current_token(current_offset, i, pipeline_json, result)
+        if _should_insert_comma(i, pipeline_json):
             result = result[:current_offset] + ',' + result[current_offset:]
             current_offset += 1
     return result
+def _should_insert_comma(i, pipeline_json, new_word_indicator='▁') -> bool:
+    # Only insert commas for the final token of a word
+    return pipeline_json[i - 1]['entity'] == ',' and pipeline_json[i]['word'].startswith(new_word_indicator)
+def _find_current_token(current_offset, i, pipeline_json, result, new_word_indicator='▁') -> int:
+    current_word = pipeline_json[i - 1]['word'].replace(new_word_indicator, '')
+    # Find the current word in the result string, starting the search at the current offset
+    current_offset = result.find(current_word, current_offset) + len(current_word)
+    return current_offset

tests/test_baseline.py CHANGED Viewed

@@ -11,7 +11,8 @@ def baseline_pipeline():
     "test_input",
     ['',
      'Hello world.',
-     'This test string should not have any commas inside it.']
 )
 def test_fix_commas_leaves_correct_strings_unchanged(baseline_pipeline, test_input):
     result = fix_commas(baseline_pipeline, s=test_input)
@@ -23,7 +24,13 @@ def test_fix_commas_leaves_correct_strings_unchanged(baseline_pipeline, test_inp
     [
         ['I, am.', 'I am.'],
         ['A complex     clause however it misses a comma something else and a dot...?',
-         'A complex     clause, however, it misses a comma, something else and a dot...?']]
 )
 def test_fix_commas_fixes_incorrect_commas(baseline_pipeline, test_input, expected):
     result = fix_commas(baseline_pipeline, s=test_input)

     "test_input",
     ['',
      'Hello world.',
+     'This test string should not have any commas inside it.',
+     'aAaalLL the.. weird?~! punctuation.should also . be kept-as is! Only fixing-commas.']
 )
 def test_fix_commas_leaves_correct_strings_unchanged(baseline_pipeline, test_input):
     result = fix_commas(baseline_pipeline, s=test_input)
     [
         ['I, am.', 'I am.'],
         ['A complex     clause however it misses a comma something else and a dot...?',
+         'A complex     clause, however, it misses a comma, something else and a dot...?'],
+        ['a pen an apple, \tand a pineapple!',
+         'a pen, an apple \tand a pineapple!'],
+        ['Even newlines\ntabs\tand others get preserved.',
+         'Even newlines,\ntabs\tand others get preserved.'],
+        ['I had no Creativity left, therefore, I come here, and write useless examples, for this test.',
+         'I had no Creativity left therefore, I come here and write useless examples for this test.']]
 )
 def test_fix_commas_fixes_incorrect_commas(baseline_pipeline, test_input, expected):
     result = fix_commas(baseline_pipeline, s=test_input)

tests/test_integration.py CHANGED Viewed

@@ -29,7 +29,7 @@ def test_fix_commas_fails_on_wrong_parameters(client):
      'Hello world.',
      'This test string should not have any commas inside it.']
 )
-def test_fix_commas_plain_string_unchanged(client, test_input: str):
     response = client.post('/baseline/fix-commas/', json={'s': test_input})
     assert response.status_code == 200
@@ -40,7 +40,7 @@ def test_fix_commas_plain_string_unchanged(client, test_input: str):
     "test_input, expected",
     [['I am, here.', 'I am here.'],
      ['books pens and pencils',
-      'books, pens and pencils.']]
 )
 def test_fix_commas_fixes_wrong_commas(client, test_input: str, expected: str):
     response = client.post('/baseline/fix-commas/', json={'s': test_input})

      'Hello world.',
      'This test string should not have any commas inside it.']
 )
+def test_fix_commas_correct_string_unchanged(client, test_input: str):
     response = client.post('/baseline/fix-commas/', json={'s': test_input})
     assert response.status_code == 200
     "test_input, expected",
     [['I am, here.', 'I am here.'],
      ['books pens and pencils',
+      'books, pens and pencils']]
 )
 def test_fix_commas_fixes_wrong_commas(client, test_input: str, expected: str):
     response = client.post('/baseline/fix-commas/', json={'s': test_input})