kasper-boy committed
Commit 9d3d366 · verified · Parent(s): 2237880

Upload 163 files

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. README.md +6 -6
  2. __pycache__/multipage.cpython-37.pyc +0 -0
  3. app_pages/__pycache__/about.cpython-37.pyc +0 -0
  4. app_pages/__pycache__/home.cpython-37.pyc +0 -0
  5. app_pages/__pycache__/ocr_comparator.cpython-37.pyc +0 -0
  6. app_pages/about.py +37 -0
  7. app_pages/home.py +19 -0
  8. app_pages/img_demo_1.jpg +0 -0
  9. app_pages/img_demo_2.jpg +0 -0
  10. app_pages/ocr.png +0 -0
  11. app_pages/ocr_comparator.py +1447 -0
  12. configs/_base_/default_runtime.py +17 -0
  13. configs/_base_/det_datasets/ctw1500.py +18 -0
  14. configs/_base_/det_datasets/icdar2015.py +18 -0
  15. configs/_base_/det_datasets/icdar2017.py +18 -0
  16. configs/_base_/det_datasets/synthtext.py +18 -0
  17. configs/_base_/det_datasets/toy_data.py +41 -0
  18. configs/_base_/det_models/dbnet_r18_fpnc.py +21 -0
  19. configs/_base_/det_models/dbnet_r50dcnv2_fpnc.py +23 -0
  20. configs/_base_/det_models/dbnetpp_r50dcnv2_fpnc.py +28 -0
  21. configs/_base_/det_models/drrg_r50_fpn_unet.py +21 -0
  22. configs/_base_/det_models/fcenet_r50_fpn.py +33 -0
  23. configs/_base_/det_models/fcenet_r50dcnv2_fpn.py +35 -0
  24. configs/_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem.py +126 -0
  25. configs/_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem_poly.py +126 -0
  26. configs/_base_/det_models/panet_r18_fpem_ffm.py +43 -0
  27. configs/_base_/det_models/panet_r50_fpem_ffm.py +21 -0
  28. configs/_base_/det_models/psenet_r50_fpnf.py +51 -0
  29. configs/_base_/det_models/textsnake_r50_fpn_unet.py +22 -0
  30. configs/_base_/det_pipelines/dbnet_pipeline.py +88 -0
  31. configs/_base_/det_pipelines/drrg_pipeline.py +60 -0
  32. configs/_base_/det_pipelines/fcenet_pipeline.py +118 -0
  33. configs/_base_/det_pipelines/maskrcnn_pipeline.py +57 -0
  34. configs/_base_/det_pipelines/panet_pipeline.py +156 -0
  35. configs/_base_/det_pipelines/psenet_pipeline.py +70 -0
  36. configs/_base_/det_pipelines/textsnake_pipeline.py +65 -0
  37. configs/_base_/recog_datasets/MJ_train.py +21 -0
  38. configs/_base_/recog_datasets/ST_MJ_alphanumeric_train.py +31 -0
  39. configs/_base_/recog_datasets/ST_MJ_train.py +29 -0
  40. configs/_base_/recog_datasets/ST_SA_MJ_real_train.py +81 -0
  41. configs/_base_/recog_datasets/ST_SA_MJ_train.py +48 -0
  42. configs/_base_/recog_datasets/ST_charbox_train.py +23 -0
  43. configs/_base_/recog_datasets/academic_test.py +57 -0
  44. configs/_base_/recog_datasets/seg_toy_data.py +34 -0
  45. configs/_base_/recog_datasets/toy_data.py +54 -0
  46. configs/_base_/recog_models/abinet.py +70 -0
  47. configs/_base_/recog_models/crnn.py +12 -0
  48. configs/_base_/recog_models/crnn_tps.py +18 -0
  49. configs/_base_/recog_models/master.py +61 -0
  50. configs/_base_/recog_models/nrtr_modality_transform.py +11 -0
README.md CHANGED
@@ -1,11 +1,11 @@
  ---
  title: Streamlit OCR
- emoji: 🌖
- colorFrom: yellow
- colorTo: yellow
- sdk: streamlit
- sdk_version: 1.35.0
- app_file: app.py
+ emoji:
+ colorFrom: purple
+ colorTo: green
+ sdk: gradio
+ sdk_version: 4.36.1
+ app_file: ocr_streamlit.py
  pinned: false
  license: apache-2.0
  ---
__pycache__/multipage.cpython-37.pyc ADDED
Binary file (2.65 kB)

app_pages/__pycache__/about.cpython-37.pyc ADDED
Binary file (2.02 kB)

app_pages/__pycache__/home.cpython-37.pyc ADDED
Binary file (889 Bytes)

app_pages/__pycache__/ocr_comparator.cpython-37.pyc ADDED
Binary file (48.1 kB)
app_pages/about.py ADDED
@@ -0,0 +1,37 @@
+ import streamlit as st
+
+ def app():
+     st.title("OCR solutions comparator")
+
+     st.write("")
+     st.write("")
+     st.write("")
+
+     st.markdown("##### This app allows you to compare, from a given picture, the results of different solutions:")
+     st.markdown("##### *EasyOcr, PaddleOCR, MMOCR, Tesseract*")
+     st.write("")
+     st.write("")
+
+     st.markdown(''' The 1st step is to choose the language for the text recognition (not all solutions \
+                 support the same languages), and then choose the picture to consider. It is possible to upload a file, \
+                 to take a picture, or to use a demo file. \
+                 It is then possible to change the default values for the text area detection process, \
+                 before launching the detection task for each solution.''')
+     st.write("")
+
+     st.markdown(''' The different results are then presented. The 2nd step is to choose one of these \
+                 detection results, in order to carry out the text recognition process there. It is also possible to change \
+                 the default settings for each solution.''')
+     st.write("")
+
+     st.markdown("###### The recognition results appear in 2 formats:")
+     st.markdown(''' - a visual format reproduces the initial image, replacing the detected areas with \
+                 the recognized text. The background is more or less strongly colored in green according to the \
+                 confidence level of the recognition.
+                 A slider allows you to change the font size, another \
+                 allows you to modify the confidence threshold above which the text color changes: if it is set at \
+                 70% for example, then all the texts with a confidence higher than or equal to 70 will appear \
+                 in white, and in black otherwise.''')
+
+     st.markdown(" - a detailed format presents the results in a table, for each text box detected. \
+                 It is possible to download these results to a local CSV file.")
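The white-or-black text rule described above is a straight threshold on the recognition confidence. A minimal sketch of that rule (the helper name text_color is hypothetical; the app applies the same comparison inline when drawing the results):

    # White overlay text at or above the confidence threshold, black below it,
    # as described in the About page above.
    def text_color(confidence, threshold=70.0):
        return (255, 255, 255) if confidence >= threshold else (0, 0, 0)

    assert text_color(70.0) == (255, 255, 255)
    assert text_color(69.9) == (0, 0, 0)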
app_pages/home.py ADDED
@@ -0,0 +1,19 @@
+ import streamlit as st
+
+ def app():
+     st.image('ocr.png')
+
+     st.write("")
+
+     st.markdown('''#### OCR, or Optical Character Recognition, is a computer vision task \
+                 which includes the detection of text areas and the recognition of characters.''')
+     st.write("")
+     st.write("")
+
+     st.markdown("##### This app allows you to compare, from a given image, the results of different solutions:")
+     st.markdown("##### *EasyOcr, PaddleOCR, MMOCR, Tesseract*")
+     st.write("")
+     st.write("")
+     st.markdown("👈 Select the **About** page from the sidebar for information on how the app works")
+
+     st.markdown("👈 or directly select the **App** page")
app_pages/img_demo_1.jpg ADDED
app_pages/img_demo_2.jpg ADDED
app_pages/ocr.png ADDED
app_pages/ocr_comparator.py ADDED
@@ -0,0 +1,1447 @@
+ """This Streamlit app allows you to compare, from a given image, the results of different solutions:
+ EasyOcr, PaddleOCR, MMOCR, Tesseract
+ """
+
+ #import mim
+ #
+ #mim.install(['mmengine>=0.7.1,<1.1.0'])
+ #mim.install(['mmcv>=2.0.0rc4,<2.1.0'])
+ #mim.install(['mmdet>=3.0.rc5,<3.2.0'])
+ #mim.install(['mmocr'])
+
+ import streamlit as st
+ import plotly.express as px
+ import numpy as np
+ import math
+ import pandas as pd
+ from time import sleep
+
+ import cv2
+ from PIL import Image, ImageColor
+ import PIL
+ import easyocr
+ from paddleocr import PaddleOCR
+ #from mmocr.utils.ocr import MMOCR
+ import pytesseract
+ from pytesseract import Output
+ import os
+ from mycolorpy import colorlist as mcp
+
+
+ ###################################################################################################
+ ## MAIN
+ ###################################################################################################
+ def app():
+
+     ###############################################################################################
+     ## FUNCTIONS
+     ###############################################################################################
+
+     @st.cache
+     def convert_df(in_df):
+         """Convert data frame function, used by download button
+
+         Args:
+             in_df (data frame): data frame to convert
+
+         Returns:
+             data frame: converted data frame
+         """
+         # IMPORTANT: Cache the conversion to prevent computation on every rerun
+         return in_df.to_csv().encode('utf-8')
+
+     ###
+     def easyocr_coord_convert(in_list_coord):
+         """Convert easyocr coordinates to the standard format used by other functions
+
+         Args:
+             in_list_coord (list of numbers): format [x_min, x_max, y_min, y_max]
+
+         Returns:
+             list of lists: format [ [x_min, y_min], [x_max, y_min], [x_max, y_max], [x_min, y_max] ]
+         """
+
+         coord = in_list_coord
+         return [[coord[0], coord[2]], [coord[1], coord[2]], [coord[1], coord[3]], [coord[0], coord[3]]]
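# [Editor's illustration, not part of the committed file] The conversion above
# turns EasyOCR's [x_min, x_max, y_min, y_max] output into the four-corner
# format the other functions expect, e.g.:
#     easyocr_coord_convert([10, 50, 20, 40])
#     # -> [[10, 20], [50, 20], [50, 40], [10, 40]]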
+
+     ###
+     @st.cache(show_spinner=False)
+     def initializations():
+         """Initializations for the app
+
+         Returns:
+             list of strings : list of OCR solutions names
+                 (['EasyOCR', 'PPOCR', 'MMOCR', 'Tesseract'])
+             dict : names and indices of the OCR solutions
+                 ({'EasyOCR': 0, 'PPOCR': 1, 'MMOCR': 2, 'Tesseract': 3})
+             list of dicts : list of languages supported by each OCR solution
+             list of int : columns for recognition details results
+             dict : confidence color scale
+             plotly figure : confidence color scale figure
+         """
+         # the readers considered
+         #out_reader_type_list = ['EasyOCR', 'PPOCR', 'MMOCR', 'Tesseract']
+         #out_reader_type_dict = {'EasyOCR': 0, 'PPOCR': 1, 'MMOCR': 2, 'Tesseract': 3}
+         out_reader_type_list = ['EasyOCR', 'PPOCR', 'Tesseract']
+         out_reader_type_dict = {'EasyOCR': 0, 'PPOCR': 1, 'Tesseract': 2}
+
+         # Columns for recognition details results
+         out_cols_size = [2] + [2,1]*(len(out_reader_type_list)-1) # Except Tesseract
+
+         # Dicts of languages supported by each reader
+         out_dict_lang_easyocr = {'Abaza': 'abq', 'Adyghe': 'ady', 'Afrikaans': 'af', 'Angika': 'ang', \
+             'Arabic': 'ar', 'Assamese': 'as', 'Avar': 'ava', 'Azerbaijani': 'az', 'Belarusian': 'be', \
+             'Bulgarian': 'bg', 'Bihari': 'bh', 'Bhojpuri': 'bho', 'Bengali': 'bn', 'Bosnian': 'bs', \
+             'Simplified Chinese': 'ch_sim', 'Traditional Chinese': 'ch_tra', 'Chechen': 'che', \
+             'Czech': 'cs', 'Welsh': 'cy', 'Danish': 'da', 'Dargwa': 'dar', 'German': 'de', \
+             'English': 'en', 'Spanish': 'es', 'Estonian': 'et', 'Persian (Farsi)': 'fa', 'French': 'fr', \
+             'Irish': 'ga', 'Goan Konkani': 'gom', 'Hindi': 'hi', 'Croatian': 'hr', 'Hungarian': 'hu', \
+             'Indonesian': 'id', 'Ingush': 'inh', 'Icelandic': 'is', 'Italian': 'it', 'Japanese': 'ja', \
+             'Kabardian': 'kbd', 'Kannada': 'kn', 'Korean': 'ko', 'Kurdish': 'ku', 'Latin': 'la', \
+             'Lak': 'lbe', 'Lezghian': 'lez', 'Lithuanian': 'lt', 'Latvian': 'lv', 'Magahi': 'mah', \
+             'Maithili': 'mai', 'Maori': 'mi', 'Mongolian': 'mn', 'Marathi': 'mr', 'Malay': 'ms', \
+             'Maltese': 'mt', 'Nepali': 'ne', 'Newari': 'new', 'Dutch': 'nl', 'Norwegian': 'no', \
+             'Occitan': 'oc', 'Pali': 'pi', 'Polish': 'pl', 'Portuguese': 'pt', 'Romanian': 'ro', \
+             'Russian': 'ru', 'Serbian (cyrillic)': 'rs_cyrillic', 'Serbian (latin)': 'rs_latin', \
+             'Nagpuri': 'sck', 'Slovak': 'sk', 'Slovenian': 'sl', 'Albanian': 'sq', 'Swedish': 'sv', \
+             'Swahili': 'sw', 'Tamil': 'ta', 'Tabassaran': 'tab', 'Telugu': 'te', 'Thai': 'th', \
+             'Tajik': 'tjk', 'Tagalog': 'tl', 'Turkish': 'tr', 'Uyghur': 'ug', 'Ukranian': 'uk', \
+             'Urdu': 'ur', 'Uzbek': 'uz', 'Vietnamese': 'vi'}
+
+         out_dict_lang_ppocr = {'Abaza': 'abq', 'Adyghe': 'ady', 'Afrikaans': 'af', 'Albanian': 'sq', \
+             'Angika': 'ang', 'Arabic': 'ar', 'Avar': 'ava', 'Azerbaijani': 'az', 'Belarusian': 'be', \
+             'Bhojpuri': 'bho','Bihari': 'bh','Bosnian': 'bs','Bulgarian': 'bg','Chinese & English': 'ch', \
+             'Chinese Traditional': 'chinese_cht', 'Croatian': 'hr', 'Czech': 'cs', 'Danish': 'da', \
+             'Dargwa': 'dar', 'Dutch': 'nl', 'English': 'en', 'Estonian': 'et', 'French': 'fr', \
+             'German': 'german','Goan Konkani': 'gom','Hindi': 'hi','Hungarian': 'hu','Icelandic': 'is', \
+             'Indonesian': 'id', 'Ingush': 'inh', 'Irish': 'ga', 'Italian': 'it', 'Japan': 'japan', \
+             'Kabardian': 'kbd', 'Korean': 'korean', 'Kurdish': 'ku', 'Lak': 'lbe', 'Latvian': 'lv', \
+             'Lezghian': 'lez', 'Lithuanian': 'lt', 'Magahi': 'mah', 'Maithili': 'mai', 'Malay': 'ms', \
+             'Maltese': 'mt', 'Maori': 'mi', 'Marathi': 'mr', 'Mongolian': 'mn', 'Nagpur': 'sck', \
+             'Nepali': 'ne', 'Newari': 'new', 'Norwegian': 'no', 'Occitan': 'oc', 'Persian': 'fa', \
+             'Polish': 'pl', 'Portuguese': 'pt', 'Romanian': 'ro', 'Russia': 'ru', 'Saudi Arabia': 'sa', \
+             'Serbian(cyrillic)': 'rs_cyrillic', 'Serbian(latin)': 'rs_latin', 'Slovak': 'sk', \
+             'Slovenian': 'sl', 'Spanish': 'es', 'Swahili': 'sw', 'Swedish': 'sv', 'Tabassaran': 'tab', \
+             'Tagalog': 'tl', 'Tamil': 'ta', 'Telugu': 'te', 'Turkish': 'tr', 'Ukranian': 'uk', \
+             'Urdu': 'ur', 'Uyghur': 'ug', 'Uzbek': 'uz', 'Vietnamese': 'vi', 'Welsh': 'cy'}
+
+         #out_dict_lang_mmocr = {'English & Chinese': 'en'}
+
+         out_dict_lang_tesseract = {'Afrikaans': 'afr','Albanian': 'sqi','Amharic': 'amh', \
+             'Arabic': 'ara', 'Armenian': 'hye','Assamese': 'asm','Azerbaijani - Cyrilic': 'aze_cyrl', \
+             'Azerbaijani': 'aze', 'Basque': 'eus','Belarusian': 'bel','Bengali': 'ben','Bosnian': 'bos', \
+             'Breton': 'bre', 'Bulgarian': 'bul','Burmese': 'mya','Catalan; Valencian': 'cat', \
+             'Cebuano': 'ceb', 'Central Khmer': 'khm','Cherokee': 'chr','Chinese - Simplified': 'chi_sim', \
+             'Chinese - Traditional': 'chi_tra','Corsican': 'cos','Croatian': 'hrv','Czech': 'ces', \
+             'Danish':'dan','Dutch; Flemish':'nld','Dzongkha':'dzo','English, Middle (1100-1500)':'enm', \
+             'English': 'eng','Esperanto': 'epo','Estonian': 'est','Faroese': 'fao', \
+             'Filipino (old - Tagalog)': 'fil','Finnish': 'fin','French, Middle (ca.1400-1600)': 'frm', \
+             'French': 'fra','Galician': 'glg','Georgian - Old': 'kat_old','Georgian': 'kat', \
+             'German - Fraktur': 'frk','German': 'deu','Greek, Modern (1453-)': 'ell','Gujarati': 'guj', \
+             'Haitian; Haitian Creole': 'hat','Hebrew': 'heb','Hindi': 'hin','Hungarian': 'hun', \
+             'Icelandic': 'isl','Indonesian': 'ind','Inuktitut': 'iku','Irish': 'gle', \
+             'Italian - Old': 'ita_old','Italian': 'ita','Japanese': 'jpn','Javanese': 'jav', \
+             'Kannada': 'kan','Kazakh': 'kaz','Kirghiz; Kyrgyz': 'kir','Korean (vertical)': 'kor_vert', \
+             'Korean': 'kor','Kurdish (Arabic Script)': 'kur_ara','Lao': 'lao','Latin': 'lat', \
+             'Latvian':'lav','Lithuanian':'lit','Luxembourgish':'ltz','Macedonian':'mkd','Malay':'msa', \
+             'Malayalam': 'mal','Maltese': 'mlt','Maori': 'mri','Marathi': 'mar','Mongolian': 'mon', \
+             'Nepali': 'nep','Norwegian': 'nor','Occitan (post 1500)': 'oci', \
+             'Orientation and script detection module':'osd','Oriya':'ori','Panjabi; Punjabi':'pan', \
+             'Persian':'fas','Polish':'pol','Portuguese':'por','Pushto; Pashto':'pus','Quechua':'que', \
+             'Romanian; Moldavian; Moldovan': 'ron','Russian': 'rus','Sanskrit': 'san', \
+             'Scottish Gaelic': 'gla','Serbian - Latin': 'srp_latn','Serbian': 'srp','Sindhi': 'snd', \
+             'Sinhala; Sinhalese': 'sin','Slovak': 'slk','Slovenian': 'slv', \
+             'Spanish; Castilian - Old': 'spa_old','Spanish; Castilian': 'spa','Sundanese': 'sun', \
+             'Swahili': 'swa','Swedish': 'swe','Syriac': 'syr','Tajik': 'tgk','Tamil': 'tam', \
+             'Tatar':'tat','Telugu':'tel','Thai':'tha','Tibetan':'bod','Tigrinya':'tir','Tonga':'ton', \
+             'Turkish': 'tur','Uighur; Uyghur': 'uig','Ukrainian': 'ukr','Urdu': 'urd', \
+             'Uzbek - Cyrilic': 'uzb_cyrl','Uzbek': 'uzb','Vietnamese': 'vie','Welsh': 'cym', \
+             'Western Frisian': 'fry','Yiddish': 'yid','Yoruba': 'yor'}
+
+         out_list_dict_lang = [out_dict_lang_easyocr, out_dict_lang_ppocr, \
+                               #out_dict_lang_mmocr, \
+                               out_dict_lang_tesseract]
+
+         # Initialization of detection form
+         if 'columns_size' not in st.session_state:
+             st.session_state.columns_size = [2] + [1 for x in out_reader_type_list[1:]]
+         if 'column_width' not in st.session_state:
+             st.session_state.column_width = [400] + [300 for x in out_reader_type_list[1:]]
+         if 'columns_color' not in st.session_state:
+             st.session_state.columns_color = ["rgb(228,26,28)"] + \
+                 ["rgb(79, 43, 255)" for x in out_reader_type_list[1:]]
+         if 'list_coordinates' not in st.session_state:
+             st.session_state.list_coordinates = []
+
+         # Confidence color scale
+         out_list_confid = list(np.arange(0,101,1))
+         out_list_grad = mcp.gen_color_normalized(cmap="Greens",data_arr=np.array(out_list_confid))
+         out_dict_back_colors = {out_list_confid[i]: out_list_grad[i] \
+                                 for i in range(len(out_list_confid))}
+
+         list_y = [1 for i in out_list_confid]
+         df_confid = pd.DataFrame({'% confidence scale': out_list_confid, 'y': list_y})
+
+         out_fig = px.scatter(df_confid, x='% confidence scale', y='y', \
+                              hover_data={'% confidence scale': True, 'y': False},
+                              color=out_dict_back_colors.values(), range_y=[0.9,1.1], range_x=[0,100],
+                              color_discrete_map="identity",height=50,symbol='y',symbol_sequence=['square'])
+         out_fig.update_xaxes(showticklabels=False)
+         out_fig.update_yaxes(showticklabels=False, range=[0.1, 1.1], visible=False)
+         out_fig.update_traces(marker_size=50)
+         out_fig.update_layout(paper_bgcolor="white", margin=dict(b=0,r=0,t=0,l=0), xaxis_side="top", \
+                               showlegend=False)
+
+         return out_reader_type_list, out_reader_type_dict, out_list_dict_lang, \
+                out_cols_size, out_dict_back_colors, out_fig
+
+     ###
+     @st.experimental_memo(show_spinner=False)
+     def init_easyocr(in_params):
+         """Initialization of easyOCR reader
+
+         Args:
+             in_params (list): list with the language
+
+         Returns:
+             easyocr reader: the easyocr reader instance
+         """
+         out_ocr = easyocr.Reader(in_params)
+         return out_ocr
+
+     ###
+     @st.cache(show_spinner=False)
+     def init_ppocr(in_params):
+         """Initialization of PPOCR reader
+
+         Args:
+             in_params (dict): dict with parameters
+
+         Returns:
+             ppocr reader: the ppocr reader instance
+         """
+         out_ocr = PaddleOCR(lang=in_params[0], **in_params[1])
+         return out_ocr
+
+     ###
+     #@st.experimental_memo(show_spinner=False)
+     #def init_mmocr(in_params):
+     #    """Initialization of MMOCR reader
+     #
+     #    Args:
+     #        in_params (dict): dict with parameters
+     #
+     #    Returns:
+     #        mmocr reader: the mmocr reader instance
+     #    """
+     #    out_ocr = MMOCR(recog=None, **in_params[1])
+     #    return out_ocr
+
+     ###
+     def init_readers(in_list_params):
+         """Initialization of the readers, and return them as list
+
+         Args:
+             in_list_params (list): list of dicts of parameters for each reader
+
+         Returns:
+             list: list of the readers' instances
+         """
+         # Instantiations of the readers :
+         # - EasyOCR
+         with st.spinner("EasyOCR reader initialization in progress ..."):
+             reader_easyocr = init_easyocr([in_list_params[0][0]])
+
+         # - PPOCR
+         # Paddleocr
+         with st.spinner("PPOCR reader initialization in progress ..."):
+             reader_ppocr = init_ppocr(in_list_params[1])
+
+         # - MMOCR
+         #with st.spinner("MMOCR reader initialization in progress ..."):
+         #    reader_mmocr = init_mmocr(in_list_params[2])
+
+         out_list_readers = [reader_easyocr, reader_ppocr] #, reader_mmocr]
+
+         return out_list_readers
+
+     ###
+     def load_image(in_image_file):
+         """Load input file and open it
+
+         Args:
+             in_image_file (string or Streamlit UploadedFile): image to consider
+
+         Returns:
+             string : locally saved image path
+             PIL.Image : input file opened with Pillow
+             matrix : input file opened with OpenCV
+         """
+
+         #if isinstance(in_image_file, str):
+         #    out_image_path = "img."+in_image_file.split('.')[-1]
+         #else:
+         #    out_image_path = "img."+in_image_file.name.split('.')[-1]
+
+         if isinstance(in_image_file, str):
+             out_image_path = "tmp_"+in_image_file
+         else:
+             out_image_path = "tmp_"+in_image_file.name
+
+         img = Image.open(in_image_file)
+         img_saved = img.save(out_image_path)
+
+         # Read image
+         out_image_orig = Image.open(out_image_path)
+         out_image_cv2 = cv2.cvtColor(cv2.imread(out_image_path), cv2.COLOR_BGR2RGB)
+
+         return out_image_path, out_image_orig, out_image_cv2
+
+     ###
+     @st.experimental_memo(show_spinner=False)
+     def easyocr_detect(_in_reader, in_image_path, in_params):
+         """Detection with EasyOCR
+
+         Args:
+             _in_reader (EasyOCR reader) : the previously initialized instance
+             in_image_path (string) : locally saved image path
+             in_params (list) : list with the parameters for detection
+
+         Returns:
+             list : list of the boxes coordinates
+             exception on error, string 'OK' otherwise
+         """
+         try:
+             dict_param = in_params[1]
+             detection_result = _in_reader.detect(in_image_path,
+                                                  #width_ths=0.7,
+                                                  #mag_ratio=1.5
+                                                  **dict_param
+                                                  )
+             easyocr_coordinates = detection_result[0][0]
+
+             # The format of the coordinate is as follows: [x_min, x_max, y_min, y_max]
+             # Format boxes coordinates for draw
+             out_easyocr_boxes_coordinates = list(map(easyocr_coord_convert, easyocr_coordinates))
+             out_status = 'OK'
+         except Exception as e:
+             out_easyocr_boxes_coordinates = []
+             out_status = e
+
+         return out_easyocr_boxes_coordinates, out_status
+
+     ###
+     @st.experimental_memo(show_spinner=False)
+     def ppocr_detect(_in_reader, in_image_path):
+         """Detection with PPOCR
+
+         Args:
+             _in_reader (PPOCR reader) : the previously initialized instance
+             in_image_path (string) : locally saved image path
+
+         Returns:
+             list : list of the boxes coordinates
+             exception on error, string 'OK' otherwise
+         """
+         # PPOCR detection method
+         try:
+             out_ppocr_boxes_coordinates = _in_reader.ocr(in_image_path, rec=False)
+             out_status = 'OK'
+         except Exception as e:
+             out_ppocr_boxes_coordinates = []
+             out_status = e
+
+         return out_ppocr_boxes_coordinates, out_status
+
+     ###
+     #@st.experimental_memo(show_spinner=False)
+     #def mmocr_detect(_in_reader, in_image_path):
+     #    """Detection with MMOCR
+     #
+     #    Args:
+     #        _in_reader (MMOCR reader) : the previously initialized instance
+     #        in_image_path (string) : locally saved image path
+     #        in_params (list) : list with the parameters
+     #
+     #    Returns:
+     #        list : list of the boxes coordinates
+     #        exception on error, string 'OK' otherwise
+     #    """
+     #    # MMOCR detection method
+     #    out_mmocr_boxes_coordinates = []
+     #    try:
+     #        det_result = _in_reader.readtext(in_image_path, details=True)
+     #        bboxes_list = [res['boundary_result'] for res in det_result]
+     #        for bboxes in bboxes_list:
+     #            for bbox in bboxes:
+     #                if len(bbox) > 9:
+     #                    min_x = min(bbox[0:-1:2])
+     #                    min_y = min(bbox[1:-1:2])
+     #                    max_x = max(bbox[0:-1:2])
+     #                    max_y = max(bbox[1:-1:2])
+     #                    #box = [min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y]
+     #                else:
+     #                    min_x = min(bbox[0:-1:2])
+     #                    min_y = min(bbox[1::2])
+     #                    max_x = max(bbox[0:-1:2])
+     #                    max_y = max(bbox[1::2])
+     #                box4 = [ [min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y] ]
+     #                out_mmocr_boxes_coordinates.append(box4)
+     #        out_status = 'OK'
+     #    except Exception as e:
+     #        out_status = e
+     #
+     #    return out_mmocr_boxes_coordinates, out_status
+
+     ###
+     def cropped_1box(in_box, in_img):
+         """Construction of a cropped image corresponding to an area of the initial image
+
+         Args:
+             in_box (list) : box with coordinates
+             in_img (matrix) : image
+
+         Returns:
+             matrix : cropped image
+         """
+         box_ar = np.array(in_box).astype(np.int64)
+         x_min = box_ar[:, 0].min()
+         x_max = box_ar[:, 0].max()
+         y_min = box_ar[:, 1].min()
+         y_max = box_ar[:, 1].max()
+         out_cropped = in_img[y_min:y_max, x_min:x_max]
+
+         return out_cropped
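# [Editor's illustration, not part of the committed file] cropped_1box() keeps
# the axis-aligned bounding box of the four points, e.g. for
# in_box = [[10, 20], [50, 20], [50, 40], [10, 40]] it returns
# in_img[20:40, 10:50] (rows y_min:y_max, columns x_min:x_max).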
+
+     ###
+     @st.experimental_memo(show_spinner=False)
+     def tesserocr_detect(in_image_path, _in_img, in_params):
+         """Detection with Tesseract
+
+         Args:
+             in_image_path (string) : locally saved image path
+             _in_img (PIL.Image) : image to consider
+             in_params (list) : list with the parameters for detection
+
+         Returns:
+             list : list of the boxes coordinates
+             exception on error, string 'OK' otherwise
+         """
+         try:
+             dict_param = in_params[1]
+             df_res = pytesseract.image_to_data(_in_img, **dict_param, output_type=Output.DATAFRAME)
+
+             df_res['box'] = df_res.apply(lambda d: [[d['left'], d['top']], \
+                                                     [d['left'] + d['width'], d['top']], \
+                                                     [d['left'] + d['width'], d['top'] + d['height']], \
+                                                     [d['left'], d['top'] + d['height']], \
+                                                    ], axis=1)
+             out_tesserocr_boxes_coordinates = df_res[df_res.word_num > 0]['box'].to_list()
+             out_status = 'OK'
+         except Exception as e:
+             out_tesserocr_boxes_coordinates = []
+             out_status = e
+
+         return out_tesserocr_boxes_coordinates, out_status
+
+     ###
+     @st.experimental_memo(show_spinner=False)
+     def process_detect(in_image_path, _in_list_images, _in_list_readers, in_list_params, in_color):
+         """Detection process for each OCR solution
+
+         Args:
+             in_image_path (string) : locally saved image path
+             _in_list_images (list) : list of original image
+             _in_list_readers (list) : list with previously initialized reader's instances
+             in_list_params (list) : list with dict parameters for each OCR solution
+             in_color (tuple) : color for boxes around text
+
+         Returns:
+             list: list of detection results images
+             list: list of boxes coordinates
+         """
+         ## ------- EasyOCR Text detection
+         with st.spinner('EasyOCR Text detection in progress ...'):
+             easyocr_boxes_coordinates,easyocr_status = easyocr_detect(_in_list_readers[0], \
+                                                                       in_image_path, in_list_params[0])
+             # Visualization
+             if easyocr_boxes_coordinates:
+                 easyocr_image_detect = draw_detected(_in_list_images[0], easyocr_boxes_coordinates, \
+                                                      in_color, 'None', 3)
+             else:
+                 easyocr_image_detect = easyocr_status
+         ##
+
+         ## ------- PPOCR Text detection
+         with st.spinner('PPOCR Text detection in progress ...'):
+             list_ppocr_boxes_coordinates, ppocr_status = ppocr_detect(_in_list_readers[1], in_image_path)
+             ppocr_boxes_coordinates = list_ppocr_boxes_coordinates[0]
+             # Visualization
+             if ppocr_boxes_coordinates:
+                 ppocr_image_detect = draw_detected(_in_list_images[0], ppocr_boxes_coordinates, \
+                                                    in_color, 'None', 3)
+             else:
+                 ppocr_image_detect = ppocr_status
+         ##
+
+         ## ------- MMOCR Text detection
+         #with st.spinner('MMOCR Text detection in progress ...'):
+         #    mmocr_boxes_coordinates, mmocr_status = mmocr_detect(_in_list_readers[2], in_image_path)
+         #    # Visualization
+         #    if mmocr_boxes_coordinates:
+         #        mmocr_image_detect = draw_detected(_in_list_images[0], mmocr_boxes_coordinates, \
+         #                                           in_color, 'None', 3)
+         #    else:
+         #        mmocr_image_detect = mmocr_status
+         ##
+
+         ## ------- Tesseract Text detection
+         with st.spinner('Tesseract Text detection in progress ...'):
+             tesserocr_boxes_coordinates, tesserocr_status = tesserocr_detect(in_image_path, \
+                                                                              _in_list_images[0], \
+                                                                              in_list_params[2]) #in_list_params[3]
+             # Visualization
+             if tesserocr_status == 'OK':
+                 tesserocr_image_detect = draw_detected(_in_list_images[0],tesserocr_boxes_coordinates,\
+                                                        in_color, 'None', 3)
+             else:
+                 tesserocr_image_detect = tesserocr_status
+         ##
+         #
+         out_list_images = _in_list_images + [easyocr_image_detect, ppocr_image_detect, \
+                                              # mmocr_image_detect, \
+                                              tesserocr_image_detect]
+         out_list_coordinates = [easyocr_boxes_coordinates, ppocr_boxes_coordinates, \
+                                 # mmocr_boxes_coordinates, \
+                                 tesserocr_boxes_coordinates]
+         #
+
+         return out_list_images, out_list_coordinates
+
+     ###
+     def draw_detected(in_image, in_boxes_coordinates, in_color, posit='None', in_thickness=4):
+         """Draw boxes around detected text
+
+         Args:
+             in_image (PIL.Image) : original image
+             in_boxes_coordinates (list) : boxes coordinates, from top to bottom and from left to right
+                 [ [ [x_min, y_min], [x_max, y_min], [x_max, y_max], [x_min, y_max] ],
+                   [ ... ]
+                 ]
+             in_color (tuple) : color for boxes around text
+             posit (str, optional) : position for text. Defaults to 'None'.
+             in_thickness (int, optional): thickness of the box. Defaults to 4.
+
+         Returns:
+             PIL.Image : original image with detected areas
+         """
+         work_img = in_image.copy()
+         if in_boxes_coordinates:
+             font = cv2.FONT_HERSHEY_SIMPLEX
+             for ind_box, box in enumerate(in_boxes_coordinates):
+                 box = np.reshape(np.array(box), [-1, 1, 2]).astype(np.int64)
+                 work_img = cv2.polylines(np.array(work_img), [box], True, in_color, in_thickness)
+                 if posit != 'None':
+                     if posit == 'top_left':
+                         pos = tuple(box[0][0])
+                     elif posit == 'top_right':
+                         pos = tuple(box[1][0])
+                     work_img = cv2.putText(work_img, str(ind_box+1), pos, font, 5.5, color, \
+                                            in_thickness,cv2.LINE_AA)
+
+             out_image_drawn = Image.fromarray(work_img)
+         else:
+             out_image_drawn = work_img
+
+         return out_image_drawn
+
+     ###
+     @st.experimental_memo(show_spinner=False)
+     def get_cropped(in_boxes_coordinates, in_image_cv):
+         """Construct list of cropped images corresponding to the input boxes coordinates list
+
+         Args:
+             in_boxes_coordinates (list) : list of boxes coordinates
+             in_image_cv (matrix) : original image
+
+         Returns:
+             list : list with cropped images
+         """
+         out_list_images = []
+         for box in in_boxes_coordinates:
+             cropped = cropped_1box(box, in_image_cv)
+             out_list_images.append(cropped)
+         return out_list_images
+
+     ###
+     def process_recog(in_list_readers, in_image_cv, in_boxes_coordinates, in_list_dict_params):
+         """Recognition process for each OCR solution
+
+         Args:
+             in_list_readers (list) : list with previously initialized reader's instances
+             in_image_cv (matrix) : original image
+             in_boxes_coordinates (list) : list of boxes coordinates
+             in_list_dict_params (list) : list with dict parameters for each OCR solution
+
+         Returns:
+             data frame : results for each OCR solution, except Tesseract
+             data frame : results for Tesseract
+             list : status for each recognition (exception or 'OK')
+         """
+         out_df_results = pd.DataFrame([])
+
+         list_text_easyocr = []
+         list_confidence_easyocr = []
+         list_text_ppocr = []
+         list_confidence_ppocr = []
+         #list_text_mmocr = []
+         #list_confidence_mmocr = []
+
+         # Create cropped images from detection
+         list_cropped_images = get_cropped(in_boxes_coordinates, in_image_cv)
+
+         # Recognize with EasyOCR
+         with st.spinner('EasyOCR Text recognition in progress ...'):
+             list_text_easyocr, list_confidence_easyocr, status_easyocr = \
+                 easyocr_recog(list_cropped_images, in_list_readers[0], in_list_dict_params[0])
+         ##
+
+         # Recognize with PPOCR
+         with st.spinner('PPOCR Text recognition in progress ...'):
+             list_text_ppocr, list_confidence_ppocr, status_ppocr = \
+                 ppocr_recog(list_cropped_images, in_list_dict_params[1])
+         ##
+
+         # Recognize with MMOCR
+         #with st.spinner('MMOCR Text recognition in progress ...'):
+         #    list_text_mmocr, list_confidence_mmocr, status_mmocr = \
+         #        mmocr_recog(list_cropped_images, in_list_dict_params[2])
+         ##
+
+         # Recognize with Tesseract
+         with st.spinner('Tesseract Text recognition in progress ...'):
+             out_df_results_tesseract, status_tesseract = \
+                 tesserocr_recog(in_image_cv, in_list_dict_params[2], len(list_cropped_images))
+                 #tesserocr_recog(in_image_cv, in_list_dict_params[3], len(list_cropped_images))
+         ##
+
+         # Create results data frame
+         out_df_results = pd.DataFrame({'cropped_image': list_cropped_images,
+                                        'text_easyocr': list_text_easyocr,
+                                        'confidence_easyocr': list_confidence_easyocr,
+                                        'text_ppocr': list_text_ppocr,
+                                        'confidence_ppocr': list_confidence_ppocr,
+                                        #'text_mmocr': list_text_mmocr,
+                                        #'confidence_mmocr': list_confidence_mmocr
+                                       }
+                                      )
+
+         #out_list_reco_status = [status_easyocr, status_ppocr, status_mmocr, status_tesseract]
+         out_list_reco_status = [status_easyocr, status_ppocr, status_tesseract]
+
+         return out_df_results, out_df_results_tesseract, out_list_reco_status
+
+     ###
+     @st.experimental_memo(suppress_st_warning=True, show_spinner=False)
+     def easyocr_recog(in_list_images, _in_reader_easyocr, in_params):
+         """Recognition with EasyOCR
+
+         Args:
+             in_list_images (list) : list of cropped images
+             _in_reader_easyocr (EasyOCR reader) : the previously initialized instance
+             in_params (dict) : parameters for recognition
+
+         Returns:
+             list : list of recognized text
+             list : list of recognition confidence
+             string/Exception : recognition status
+         """
+         progress_bar = st.progress(0)
+         out_list_text_easyocr = []
+         out_list_confidence_easyocr = []
+         ## ------- EasyOCR Text recognition
+         try:
+             step = 0*len(in_list_images) # first recognition process
+             #nb_steps = 4 * len(in_list_images)
+             nb_steps = 3 * len(in_list_images)
+             for ind_img, cropped in enumerate(in_list_images):
+                 result = _in_reader_easyocr.recognize(cropped, **in_params)
+                 try:
+                     out_list_text_easyocr.append(result[0][1])
+                     out_list_confidence_easyocr.append(np.round(100*result[0][2], 1))
+                 except:
+                     out_list_text_easyocr.append('Not recognize')
+                     out_list_confidence_easyocr.append(100.)
+                 progress_bar.progress((step+ind_img+1)/nb_steps)
+             out_status = 'OK'
+         except Exception as e:
+             out_status = e
+         progress_bar.empty()
+
+         return out_list_text_easyocr, out_list_confidence_easyocr, out_status
+
+     ###
+     @st.experimental_memo(suppress_st_warning=True, show_spinner=False)
+     def ppocr_recog(in_list_images, in_params):
+         """Recognition with PPOCR
+
+         Args:
+             in_list_images (list) : list of cropped images
+             in_params (dict) : parameters for recognition
+
+         Returns:
+             list : list of recognized text
+             list : list of recognition confidence
+             string/Exception : recognition status
+         """
+         ## ------- PPOCR Text recognition
+         out_list_text_ppocr = []
+         out_list_confidence_ppocr = []
+         try:
+             reader_ppocr = PaddleOCR(**in_params)
+             step = 1*len(in_list_images) # second recognition process
+             #nb_steps = 4 * len(in_list_images)
+             nb_steps = 3 * len(in_list_images)
+             progress_bar = st.progress(step/nb_steps)
+
+             for ind_img, cropped in enumerate(in_list_images):
+                 list_result = reader_ppocr.ocr(cropped, det=False, cls=False)
+                 result = list_result[0]
+                 try:
+                     out_list_text_ppocr.append(result[0][0])
+                     out_list_confidence_ppocr.append(np.round(100*result[0][1], 1))
+                 except:
+                     out_list_text_ppocr.append('Not recognize')
+                     out_list_confidence_ppocr.append(100.)
+                 progress_bar.progress((step+ind_img+1)/nb_steps)
+             out_status = 'OK'
+         except Exception as e:
+             out_status = e
+         progress_bar.empty()
+
+         return out_list_text_ppocr, out_list_confidence_ppocr, out_status
+
+     ###
+     #@st.experimental_memo(suppress_st_warning=True, show_spinner=False)
+     #def mmocr_recog(in_list_images, in_params):
+     #    """Recognition with MMOCR
+     #
+     #    Args:
+     #        in_list_images (list) : list of cropped images
+     #        in_params (dict) : parameters for recognition
+     #
+     #    Returns:
+     #        list : list of recognized text
+     #        list : list of recognition confidence
+     #        string/Exception : recognition status
+     #    """
+     #    ## ------- MMOCR Text recognition
+     #    out_list_text_mmocr = []
+     #    out_list_confidence_mmocr = []
+     #    try:
+     #        reader_mmocr = MMOCR(det=None, **in_params)
+     #        step = 2*len(in_list_images) # third recognition process
+     #        nb_steps = 4 * len(in_list_images)
+     #        progress_bar = st.progress(step/nb_steps)
+     #
+     #        for ind_img, cropped in enumerate(in_list_images):
+     #            result = reader_mmocr.readtext(cropped, details=True)
+     #            try:
+     #                out_list_text_mmocr.append(result[0]['text'])
+     #                out_list_confidence_mmocr.append(np.round(100* \
+     #                    (np.array(result[0]['score']).mean()), 1))
+     #            except:
+     #                out_list_text_mmocr.append('Not recognize')
+     #                out_list_confidence_mmocr.append(100.)
+     #            progress_bar.progress((step+ind_img+1)/nb_steps)
+     #        out_status = 'OK'
+     #    except Exception as e:
+     #        out_status = e
+     #    progress_bar.empty()
+     #
+     #    return out_list_text_mmocr, out_list_confidence_mmocr, out_status
+     #
+     ###
+     @st.experimental_memo(suppress_st_warning=True, show_spinner=False)
+     def tesserocr_recog(in_img, in_params, in_nb_images):
+         """Recognition with Tesseract
+
+         Args:
+             in_img (matrix) : original image
+             in_params (dict) : parameters for recognition
+             in_nb_images : nb cropped images (used for progress bar)
+
+         Returns:
+             Pandas data frame : recognition results
+             string/Exception : recognition status
+         """
+         ## ------- Tesseract Text recognition
+         step = 3*in_nb_images # fourth recognition process
+         #nb_steps = 4 * in_nb_images
+         nb_steps = 3 * in_nb_images
+         progress_bar = st.progress(step/nb_steps)
+
+         try:
+             out_df_result = pytesseract.image_to_data(in_img, **in_params,output_type=Output.DATAFRAME)
+
+             out_df_result['box'] = out_df_result.apply(lambda d: [[d['left'], d['top']], \
+                                                                   [d['left'] + d['width'], d['top']], \
+                                                                   [d['left']+d['width'], d['top']+d['height']], \
+                                                                   [d['left'], d['top'] + d['height']], \
+                                                                  ], axis=1)
+             out_df_result['cropped'] = out_df_result['box'].apply(lambda b: cropped_1box(b, in_img))
+             out_df_result = out_df_result[(out_df_result.word_num > 0) & (out_df_result.text != ' ')] \
+                                 .reset_index(drop=True)
+             out_status = 'OK'
+         except Exception as e:
+             out_df_result = pd.DataFrame([])
+             out_status = e
+
+         progress_bar.progress(1.)
+
+         return out_df_result, out_status
+
+     ###
+     def draw_reco_images(in_image, in_boxes_coordinates, in_list_texts, in_list_confid, \
+                          in_dict_back_colors, in_df_results_tesseract, in_reader_type_list, \
+                          in_font_scale=1, in_conf_threshold=65):
+         """Draw recognized text on original image, for each OCR solution used
+
+         Args:
+             in_image (matrix) : original image
+             in_boxes_coordinates (list) : list of boxes coordinates
+             in_list_texts (list): list of recognized text for each recognizer (except Tesseract)
+             in_list_confid (list): list of recognition confidence for each recognizer (except Tesseract)
+             in_df_results_tesseract (Pandas data frame): Tesseract recognition results
+             in_font_scale (int, optional): text font scale. Defaults to 1.
+
+         Returns:
+             None: shows the results container
+         """
+         img = in_image.copy()
+         nb_readers = len(in_reader_type_list)
+         list_reco_images = [img.copy() for i in range(nb_readers)]
+
+         for num, box_ in enumerate(in_boxes_coordinates):
+             box = np.array(box_).astype(np.int64)
+
+             # For each box : draw the results of each recognizer
+             for ind_r in range(nb_readers-1):
+                 confid = np.round(in_list_confid[ind_r][num], 0)
+                 rgb_color = ImageColor.getcolor(in_dict_back_colors[confid], "RGB")
+                 if confid < in_conf_threshold:
+                     text_color = (0, 0, 0)
+                 else:
+                     text_color = (255, 255, 255)
+
+                 list_reco_images[ind_r] = cv2.rectangle(list_reco_images[ind_r], \
+                                                         (box[0][0], box[0][1]), \
+                                                         (box[2][0], box[2][1]), rgb_color, -1)
+                 list_reco_images[ind_r] = cv2.putText(list_reco_images[ind_r], \
+                                                       in_list_texts[ind_r][num], \
+                                                       (box[0][0],int(np.round((box[0][1]+box[2][1])/2,0))), \
+                                                       cv2.FONT_HERSHEY_DUPLEX, in_font_scale, text_color, 2)
+
+         # Add Tesseract process
+         if not in_df_results_tesseract.empty:
+             ind_tessocr = nb_readers-1
+             for num, box_ in enumerate(in_df_results_tesseract['box'].to_list()):
+                 box = np.array(box_).astype(np.int64)
+                 confid = np.round(in_df_results_tesseract.iloc[num]['conf'], 0)
+                 rgb_color = ImageColor.getcolor(in_dict_back_colors[confid], "RGB")
+                 if confid < in_conf_threshold:
+                     text_color = (0, 0, 0)
+                 else:
+                     text_color = (255, 255, 255)
+
+                 list_reco_images[ind_tessocr] = \
+                     cv2.rectangle(list_reco_images[ind_tessocr], (box[0][0], box[0][1]), \
+                                   (box[2][0], box[2][1]), rgb_color, -1)
+                 try:
+                     list_reco_images[ind_tessocr] = \
+                         cv2.putText(list_reco_images[ind_tessocr], \
+                                     in_df_results_tesseract.iloc[num]['text'], \
+                                     (box[0][0],int(np.round((box[0][1]+box[2][1])/2,0))), \
+                                     cv2.FONT_HERSHEY_DUPLEX, in_font_scale, text_color, 2)
+                 except:
+                     pass
+
+         with show_reco.container():
+             # Draw the results, 2 images per line
+             reco_lines = math.ceil(len(in_reader_type_list) / 2)
+             column_width = 400
+             for ind_lig in range(0, reco_lines+1, 2):
+                 cols = st.columns(2)
+                 for ind_col in range(2):
+                     ind = ind_lig + ind_col
+                     if ind < len(in_reader_type_list):
+                         if in_reader_type_list[ind] == 'Tesseract':
+                             column_title = '<p style="font-size: 20px;color:rgb(228,26,28); \
+                                             ">Recognition with ' + in_reader_type_list[ind] + \
+                                             '<sp style="font-size: 17px"> (with its own detector) \
+                                             </sp></p>'
+                         else:
+                             column_title = '<p style="font-size: 20px;color:rgb(228,26,28); \
+                                             ">Recognition with ' + \
+                                             in_reader_type_list[ind]+ '</p>'
+                         cols[ind_col].markdown(column_title, unsafe_allow_html=True)
+                         if st.session_state.list_reco_status[ind] == 'OK':
+                             cols[ind_col].image(list_reco_images[ind], \
+                                                 width=column_width, use_column_width=True)
+                         else:
+                             cols[ind_col].write(list_reco_status[ind], \
+                                                 use_column_width=True)
+
+         st.markdown(' 💡 Bad font size? you can adjust it below and refresh:')
+
+     ###
+     def highlight():
+         """ Highlight chosen detector results
+         """
+         with show_detect.container():
+             columns_size = [1 for x in reader_type_list]
+             column_width = [300 for x in reader_type_list]
+             columns_color = ["rgb(12, 5, 105)" for x in reader_type_list]
+             columns_size[reader_type_dict[st.session_state.detect_reader]] = 2
+             column_width[reader_type_dict[st.session_state.detect_reader]] = 400
+             columns_color[reader_type_dict[st.session_state.detect_reader]] = "rgb(228,26,28)"
+             columns = st.columns(columns_size, ) #gap='medium')
+
+             for ind_col, col in enumerate(columns):
+                 column_title = '<p style="font-size: 20px;color:'+columns_color[ind_col] + \
+                                ';">Detection with ' + reader_type_list[ind_col]+ '</p>'
+                 col.markdown(column_title, unsafe_allow_html=True)
+                 if isinstance(list_images[ind_col+2], PIL.Image.Image):
+                     col.image(list_images[ind_col+2], width=column_width[ind_col], \
+                               use_column_width=True)
+                 else:
+                     col.write(list_images[ind_col+2], use_column_width=True)
+             st.session_state.columns_size = columns_size
+             st.session_state.column_width = column_width
+             st.session_state.columns_color = columns_color
+
+     ###
+     @st.cache(show_spinner=False)
+     def get_demo():
+         """Get the demo files
+
+         Returns:
+             PIL.Image : input file opened with Pillow
+             PIL.Image : input file opened with Pillow
+         """
+
+         out_img_demo_1 = Image.open("img_demo_1.jpg")
+         out_img_demo_2 = Image.open("img_demo_2.jpg")
+
+         return out_img_demo_1, out_img_demo_2
+
+     ###
+     def raz():
+         st.session_state.list_coordinates = []
+         st.session_state.list_images = []
+         st.session_state.detect_reader = reader_type_list[0]
+
+         st.session_state.columns_size = [2] + [1 for x in reader_type_list[1:]]
+         st.session_state.column_width = [400] + [300 for x in reader_type_list[1:]]
+         st.session_state.columns_color = ["rgb(228,26,28)"] + \
+                                          ["rgb(79, 43, 255)" for x in reader_type_list[1:]]
+
+         # Clear caches
+         easyocr_detect.clear()
+         ppocr_detect.clear()
+         #mmocr_detect.clear()
+         tesserocr_detect.clear()
+         process_detect.clear()
+         get_cropped.clear()
+         easyocr_recog.clear()
+         ppocr_recog.clear()
+         #mmocr_recog.clear()
+         tesserocr_recog.clear()
+
+
964
+ ##----------- Initializations ---------------------------------------------------------------------
965
+ #print("PID : ", os.getpid())
966
+
967
+ st.title("OCR solutions comparator")
968
+ #st.markdown("##### *EasyOCR, PPOCR, Tesseract*")
969
+ st.markdown("##### *EasyOCR, PPOCR, MMOCR, Tesseract*")
970
+ #st.markdown("#### PID : " + str(os.getpid()))
971
+
972
+ # Initializations
973
+ with st.spinner("Initializations in progress ..."):
974
+ reader_type_list, reader_type_dict, list_dict_lang, \
975
+ cols_size, dict_back_colors, fig_colorscale = initializations()
976
+ img_demo_1, img_demo_2 = get_demo()
977
+
978
+ ##----------- Choose language & image -------------------------------------------------------------
979
+ st.markdown("#### Choose languages for the text recognition:")
980
+ lang_col = st.columns(4)
981
+ easyocr_key_lang = lang_col[0].selectbox(reader_type_list[0]+" :", list_dict_lang[0].keys(), 26)
982
+ easyocr_lang = list_dict_lang[0][easyocr_key_lang]
983
+ ppocr_key_lang = lang_col[1].selectbox(reader_type_list[1]+" :", list_dict_lang[1].keys(), 22)
984
+ ppocr_lang = list_dict_lang[1][ppocr_key_lang]
985
+ #mmocr_key_lang = lang_col[2].selectbox(reader_type_list[2]+" :", list_dict_lang[2].keys(), 0)
986
+ #mmocr_lang = list_dict_lang[2][mmocr_key_lang]
987
+ #tesserocr_key_lang = lang_col[3].selectbox(reader_type_list[3]+" :", list_dict_lang[3].keys(), 35)
988
+ #tesserocr_lang = list_dict_lang[3][tesserocr_key_lang]
989
+ tesserocr_key_lang = lang_col[2].selectbox(reader_type_list[2]+" :", list_dict_lang[2].keys(), 35)
990
+ tesserocr_lang = list_dict_lang[2][tesserocr_key_lang]
991
+
992
+ st.markdown("#### Choose picture:")
993
+ cols_pict = st.columns([1, 2])
994
+ img_typ = cols_pict[0].radio("", ['Upload file', 'Take a picture', 'Use a demo file'], \
995
+ index=0, on_change=raz)
996
+
997
+ if img_typ == 'Upload file':
998
+ image_file = cols_pict[1].file_uploader("Upload a file:", type=["jpg","jpeg"], on_change=raz)
999
+ if img_typ == 'Take a picture':
1000
+ image_file = cols_pict[1].camera_input("Take a picture:", on_change=raz)
1001
+ if img_typ == 'Use a demo file':
1002
+ with st.expander('Choose a demo file:', expanded=True):
1003
+ demo_used = st.radio('', ['File 1', 'File 2'], index=0, \
1004
+ horizontal=True, on_change=raz)
1005
+ cols_demo = st.columns([1, 2])
1006
+ cols_demo[0].markdown('###### File 1')
1007
+ cols_demo[0].image(img_demo_1, width=150)
1008
+ cols_demo[1].markdown('###### File 2')
1009
+ cols_demo[1].image(img_demo_2, width=300)
1010
+ if demo_used == 'File 1':
1011
+ image_file = 'img_demo_1.jpg'
1012
+ else:
1013
+ image_file = 'img_demo_2.jpg'
1014
+
1015
+ ##----------- Process input image -----------------------------------------------------------------
1016
+ if image_file is not None:
1017
+ image_path, image_orig, image_cv2 = load_image(image_file)
1018
+ list_images = [image_orig, image_cv2]
1019
+
1020
+ ##----------- Form with original image & hyperparameters for detectors ----------------------------
1021
+ with st.form("form1"):
1022
+ col1, col2 = st.columns(2, ) #gap="medium")
1023
+ col1.markdown("##### Original image")
1024
+ col1.image(list_images[0], width=400)
1025
+ col2.markdown("##### Hyperparameters values for detection")
1026
+
1027
+ with col2.expander("Choose detection hyperparameters for " + reader_type_list[0], \
1028
+ expanded=False):
1029
+ t0_min_size = st.slider("min_size", 1, 20, 10, step=1, \
1030
+ help="min_size (int, default = 10) - Filter text box smaller than \
1031
+ minimum value in pixel")
1032
+ t0_text_threshold = st.slider("text_threshold", 0.1, 1., 0.7, step=0.1, \
1033
+ help="text_threshold (float, default = 0.7) - Text confidence threshold")
1034
+ t0_low_text = st.slider("low_text", 0.1, 1., 0.4, step=0.1, \
1035
+ help="low_text (float, default = 0.4) - Text low-bound score")
1036
+ t0_link_threshold = st.slider("link_threshold", 0.1, 1., 0.4, step=0.1, \
1037
+ help="link_threshold (float, default = 0.4) - Link confidence threshold")
1038
+ t0_canvas_size = st.slider("canvas_size", 2000, 5000, 2560, step=10, \
1039
+ help='''canvas_size (int, default = 2560) \n
1040
+ Maximum e size. Image bigger than this value will be resized down''')
1041
+ t0_mag_ratio = st.slider("mag_ratio", 0.1, 5., 1., step=0.1, \
1042
+ help="mag_ratio (float, default = 1) - Image magnification ratio")
1043
+ t0_slope_ths = st.slider("slope_ths", 0.01, 1., 0.1, step=0.01, \
1044
+ help='''slope_ths (float, default = 0.1) - Maximum slope \
1045
+ (delta y/delta x) to considered merging. \n
1046
+ Low valuans tiled boxes will not be merged.''')
1047
+ t0_ycenter_ths = st.slider("ycenter_ths", 0.1, 1., 0.5, step=0.1, \
1048
+ help='''ycenter_ths (float, default = 0.5) - Maximum shift in y direction. \n
1049
+ Boxes wiifferent level should not be merged.''')
1050
+     t0_height_ths = st.slider("height_ths", 0.1, 1., 0.5, step=0.1, \
+         help='''height_ths (float, default = 0.5) - Maximum difference in box height. \n
+ Boxes with very different text sizes should not be merged.''')
+     t0_width_ths = st.slider("width_ths", 0.1, 1., 0.5, step=0.1, \
+         help="width_ths (float, default = 0.5) - Maximum horizontal \
+ distance to merge boxes.")
+     t0_add_margin = st.slider("add_margin", 0.1, 1., 0.1, step=0.1, \
+         help='''add_margin (float, default = 0.1) - \
+ Extend bounding boxes in all directions by a certain value. \n
+ This is important for languages with complex scripts (e.g. Thai).''')
+     t0_optimal_num_chars = st.slider("optimal_num_chars", 0, 100, 0, step=10, \
+         help="optimal_num_chars (int, default = None) - If specified, bounding boxes \
+ with an estimated number of characters near this value are returned first.")
+ 
+ with col2.expander("Choose detection hyperparameters for " + reader_type_list[1], \
+                    expanded=False):
+     t1_det_algorithm = st.selectbox('det_algorithm', ['DB'], \
+         help='Type of detection algorithm selected. (default = DB)')
+     t1_det_max_side_len = st.slider('det_max_side_len', 500, 2000, 960, step=10, \
+         help='''The maximum size of the long side of the image. (default = 960)\n
+ Limits the maximum image height and width.\n
+ When the long side exceeds this value, the long side will be resized to this size, and the short side \
+ will be scaled proportionally.''')
+     t1_det_db_thresh = st.slider('det_db_thresh', 0.1, 1., 0.3, step=0.1, \
+         help='''Binarization threshold value of DB output map. (default = 0.3) \n
+ Used to filter the binarized image of DB prediction; setting it to 0.1-0.3 has no obvious effect on the result.''')
+     t1_det_db_box_thresh = st.slider('det_db_box_thresh', 0.1, 1., 0.6, step=0.1, \
+         help='''The threshold value of the DB output box. (default = 0.6) \n
+ DB post-processing filter box threshold; if boxes are missing from the detection, it can be reduced as appropriate. \n
+ Boxes scored lower than this value will be discarded.''')
+     t1_det_db_unclip_ratio = st.slider('det_db_unclip_ratio', 1., 3.0, 1.6, step=0.1, \
+         help='''The expanded ratio of the DB output box. (default = 1.6) \n
+ Indicates the compactness of the text box: the smaller the value, the closer the text box is to the text.''')
+     t1_det_east_score_thresh = st.slider('det_east_score_thresh', 0.1, 1., 0.8, step=0.1, \
+         help="Binarization threshold value of EAST output map. (default = 0.8)")
+     t1_det_east_cover_thresh = st.slider('det_east_cover_thresh', 0.1, 1., 0.1, step=0.1, \
+         help='''The threshold value of the EAST output box. (default = 0.1) \n
+ Boxes scored lower than this value will be discarded.''')
+     t1_det_east_nms_thresh = st.slider('det_east_nms_thresh', 0.1, 1., 0.2, step=0.1, \
+         help="The NMS threshold value of EAST model output box. (default = 0.2)")
+     t1_det_db_score_mode = st.selectbox('det_db_score_mode', ['fast', 'slow'], \
+         help='''slow: use polygon box to calculate bbox score, fast: use rectangle box \
+ to calculate. (default = fast) \n
+ The rectangular box is faster to compute; the polygonal box is more accurate for curved text areas.''')
+ """
+ with col2.expander("Choose detection hyperparameters for " + reader_type_list[2], \
+                    expanded=False):
+     t2_det = st.selectbox('det', ['DB_r18','DB_r50','DBPP_r50','DRRG','FCE_IC15', \
+                                   'FCE_CTW_DCNv2','MaskRCNN_CTW','MaskRCNN_IC15', \
+                                   'MaskRCNN_IC17', 'PANet_CTW','PANet_IC15','PS_CTW',\
+                                   'PS_IC15','Tesseract','TextSnake'], 10, \
+         help='Text detection algorithm. (default = PANet_IC15)')
+     st.write("###### *More about text detection models* 👉 \
+ [here](https://mmocr.readthedocs.io/en/latest/textdet_models.html)")
+     t2_merge_xdist = st.slider('merge_xdist', 1, 50, 20, step=1, \
+         help='The maximum x-axis distance to merge boxes. (default = 20)')
+ """
+ #with col2.expander("Choose detection hyperparameters for " + reader_type_list[3], \
+ with col2.expander("Choose detection hyperparameters for " + reader_type_list[2], \
+                    expanded=False):
+     t3_psm = st.selectbox('Page segmentation mode (psm)', \
+                           [' - Default', \
+                            ' 4 Assume a single column of text of variable sizes', \
+                            ' 5 Assume a single uniform block of vertically aligned text', \
+                            ' 6 Assume a single uniform block of text', \
+                            ' 7 Treat the image as a single text line', \
+                            ' 8 Treat the image as a single word', \
+                            ' 9 Treat the image as a single word in a circle', \
+                            '10 Treat the image as a single character', \
+                            '11 Sparse text. Find as much text as possible in no \
+ particular order', \
+                            '13 Raw line. Treat the image as a single text line, \
+ bypassing hacks that are Tesseract-specific'])
+     t3_oem = st.selectbox('OCR engine mode', ['0 Legacy engine only', \
+                                               '1 Neural nets LSTM engine only', \
+                                               '2 Legacy + LSTM engines', \
+                                               '3 Default, based on what is available'], 3)
+     t3_whitelist = st.text_input('Limit tesseract to recognize only these characters :', \
+         placeholder='Limit tesseract to recognize only these characters', \
+         help='Example for numbers only : 0123456789')
+ 
+ color_hex = col2.color_picker('Set a color for box outlines:', '#004C99')
+ color_part = color_hex.lstrip('#')
+ color = tuple(int(color_part[i:i+2], 16) for i in (0, 2, 4))
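As a quick illustration of the hex-to-RGB conversion above, a minimal standalone sketch (using the widget's default value shown above):

    # '#004C99' -> (0, 76, 153): two hex digits per channel, parsed base 16
    color_hex = '#004C99'
    color_part = color_hex.lstrip('#')  # '004C99'
    color = tuple(int(color_part[i:i+2], 16) for i in (0, 2, 4))
    assert color == (0, 76, 153)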
+ 
+ submit_detect = st.form_submit_button("Launch detection")
+ 
+ ##----------- Process text detection --------------------------------------------------------------
+ if submit_detect:
+     # Process text detection
+ 
+     if t0_optimal_num_chars == 0:
+         t0_optimal_num_chars = None
+ 
+     # Construct the config Tesseract parameter
+     t3_config = ''
+     psm = t3_psm[:2]
+     if psm != ' -':
+         t3_config += '--psm ' + psm.strip()
+     oem = t3_oem[:1]
+     if oem != '3':
+         t3_config += ' --oem ' + oem
+     if t3_whitelist != '':
+         t3_config += ' -c tessedit_char_whitelist=' + t3_whitelist
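As a sanity check on the string built above, here is a self-contained sketch with one hypothetical selection (the psm/oem/whitelist values are examples, not app defaults):

    # Hypothetical picks: psm option ' 6 ...', oem option '1 ...', digits-only whitelist
    t3_psm = ' 6 Assume a single uniform block of text'
    t3_oem = '1 Neural nets LSTM engine only'
    t3_whitelist = '0123456789'
    t3_config = ''
    psm = t3_psm[:2]
    if psm != ' -':
        t3_config += '--psm ' + psm.strip()
    oem = t3_oem[:1]
    if oem != '3':
        t3_config += ' --oem ' + oem
    if t3_whitelist != '':
        t3_config += ' -c tessedit_char_whitelist=' + t3_whitelist
    # t3_config == '--psm 6 --oem 1 -c tessedit_char_whitelist=0123456789'
    # i.e. a `config` string in the format pytesseract forwards to the tesseract CLI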
+ 
+     list_params_det = \
+         [[easyocr_lang, \
+           {'min_size': t0_min_size, 'text_threshold': t0_text_threshold, \
+            'low_text': t0_low_text, 'link_threshold': t0_link_threshold, \
+            'canvas_size': t0_canvas_size, 'mag_ratio': t0_mag_ratio, \
+            'slope_ths': t0_slope_ths, 'ycenter_ths': t0_ycenter_ths, \
+            'height_ths': t0_height_ths, 'width_ths': t0_width_ths, \
+            'add_margin': t0_add_margin, 'optimal_num_chars': t0_optimal_num_chars \
+           }], \
+          [ppocr_lang, \
+           {'det_algorithm': t1_det_algorithm, 'det_max_side_len': t1_det_max_side_len, \
+            'det_db_thresh': t1_det_db_thresh, 'det_db_box_thresh': t1_det_db_box_thresh, \
+            'det_db_unclip_ratio': t1_det_db_unclip_ratio, \
+            'det_east_score_thresh': t1_det_east_score_thresh, \
+            'det_east_cover_thresh': t1_det_east_cover_thresh, \
+            'det_east_nms_thresh': t1_det_east_nms_thresh, \
+            'det_db_score_mode': t1_det_db_score_mode}],
+          #[mmocr_lang, {'det': t2_det, 'merge_xdist': t2_merge_xdist}],
+          [tesserocr_lang, {'lang': tesserocr_lang, 'config': t3_config}]
+         ]
+ 
+     show_info1 = st.empty()
+     show_info1.info("Readers initialization in progress (it may take a while) ...")
+     list_readers = init_readers(list_params_det)
+ 
+     show_info1.info("Text detection in progress ...")
+     list_images, list_coordinates = process_detect(image_path, list_images, list_readers, \
+                                                    list_params_det, color)
+     show_info1.empty()
+ 
+     # Clear previous recognition results
+     st.session_state.df_results = pd.DataFrame([])
+ 
+     st.session_state.list_readers = list_readers
+     st.session_state.list_coordinates = list_coordinates
+     st.session_state.list_images = list_images
+     st.session_state.list_params_det = list_params_det
+ 
+     if 'columns_size' not in st.session_state:
+         st.session_state.columns_size = [2] + [1 for x in reader_type_list[1:]]
+     if 'column_width' not in st.session_state:
+         st.session_state.column_width = [400] + [300 for x in reader_type_list[1:]]
+     if 'columns_color' not in st.session_state:
+         st.session_state.columns_color = ["rgb(228,26,28)"] + \
+                                          ["rgb(79, 43, 255)" for x in reader_type_list[1:]]
+ 
+ if st.session_state.list_coordinates:
+     list_coordinates = st.session_state.list_coordinates
+     list_images = st.session_state.list_images
+     list_readers = st.session_state.list_readers
+     list_params_det = st.session_state.list_params_det
+ 
+     ##----------- Text detection results --------------------------------------------------------------
+     st.subheader("Text detection")
+     show_detect = st.empty()
+     list_ok_detect = []
+     with show_detect.container():
+         columns = st.columns(st.session_state.columns_size)  # gap='medium'
+         for no_col, col in enumerate(columns):
+             column_title = '<p style="font-size: 20px;color:' + \
+                            st.session_state.columns_color[no_col] + \
+                            ';">Detection with ' + reader_type_list[no_col] + '</p>'
+             col.markdown(column_title, unsafe_allow_html=True)
+             if isinstance(list_images[no_col+2], PIL.Image.Image):
+                 col.image(list_images[no_col+2], width=st.session_state.column_width[no_col], \
+                           use_column_width=True)
+                 list_ok_detect.append(reader_type_list[no_col])
+             else:
+                 col.write(list_images[no_col+2], use_column_width=True)
+ 
+     st.subheader("Text recognition")
+ 
+     st.markdown("##### Using detection performed above by:")
+     st.radio('Choose the detector:', list_ok_detect, key='detect_reader', \
+              horizontal=True, on_change=highlight)
+ 
+     ##----------- Form with hyperparameters for recognition -----------------------
+     st.markdown("##### Hyperparameters values for recognition:")
+     with st.form("form2"):
+         with st.expander("Choose recognition hyperparameters for " + reader_type_list[0], \
+                          expanded=False):
+             t0_decoder = st.selectbox('decoder', ['greedy', 'beamsearch', 'wordbeamsearch'], \
+                 help="decoder (string, default = 'greedy') - options are 'greedy', \
+ 'beamsearch' and 'wordbeamsearch'.")
+             t0_beamWidth = st.slider('beamWidth', 2, 20, 5, step=1, \
+                 help="beamWidth (int, default = 5) - How many beams to keep when decoder = \
+ 'beamsearch' or 'wordbeamsearch'.")
+             t0_batch_size = st.slider('batch_size', 1, 10, 1, step=1, \
+                 help="batch_size (int, default = 1) - batch_size>1 will make EasyOCR faster \
+ but use more memory.")
+             t0_workers = st.slider('workers', 0, 10, 0, step=1, \
+                 help="workers (int, default = 0) - Number of threads used in dataloader.")
+             t0_allowlist = st.text_input('allowlist', value="", max_chars=None, \
+                 placeholder='Force EasyOCR to recognize only this subset of characters', \
+                 help='''allowlist (string) - Force EasyOCR to recognize only a subset of characters.\n
+ Useful for specific problems (e.g. license plates, etc.)''')
+             t0_blocklist = st.text_input('blocklist', value="", max_chars=None, \
+                 placeholder='Block a subset of characters (will be ignored if allowlist is given)', \
+                 help='''blocklist (string) - Block a subset of characters. This argument will be \
+ ignored if allowlist is given.''')
+             t0_detail = st.radio('detail', [0, 1], 1, horizontal=True, \
+                 help="detail (int, default = 1) - Set this to 0 for simple output")
+             t0_paragraph = st.radio('paragraph', [True, False], 1, horizontal=True, \
+                 help='paragraph (bool, default = False) - Combine results into paragraphs')
+             t0_contrast_ths = st.slider('contrast_ths', 0.05, 1., 0.1, step=0.01, \
+                 help='''contrast_ths (float, default = 0.1) - Text boxes with contrast lower than \
+ this value will be passed into the model 2 times.\n
+ First with the original image and second with the contrast adjusted to the 'adjust_contrast' value.\n
+ The result with the higher confidence level will be returned.''')
+             t0_adjust_contrast = st.slider('adjust_contrast', 0.1, 1., 0.5, step=0.1, \
+                 help='adjust_contrast (float, default = 0.5) - target contrast level for low \
+ contrast text boxes')
+ 
+         with st.expander("Choose recognition hyperparameters for " + reader_type_list[1], \
+                          expanded=False):
+             t1_rec_algorithm = st.selectbox('rec_algorithm', ['CRNN', 'SVTR_LCNet'], 0, \
+                 help="Type of recognition algorithm selected. (default=CRNN)")
+             t1_rec_batch_num = st.slider('rec_batch_num', 1, 50, step=1, \
+                 help="When performing recognition, the batch size of forward images. \
+ (default=30)")
+             t1_max_text_length = st.slider('max_text_length', 3, 250, 25, step=1, \
+                 help="The maximum text length that the recognition algorithm can recognize. \
+ (default=25)")
+             t1_use_space_char = st.radio('use_space_char', [True, False], 0, horizontal=True, \
+                 help="Whether to recognize spaces. (default=True)")
+             t1_drop_score = st.slider('drop_score', 0., 1., 0.25, step=.05, \
+                 help="Filter the output by score (from the recognition model); results \
+ below this score will not be returned. (default=0.5)")
+         """
+         with st.expander("Choose recognition hyperparameters for " + reader_type_list[2], \
+                          expanded=False):
+             t2_recog = st.selectbox('recog', ['ABINet','CRNN','CRNN_TPS','MASTER', \
+                                               'NRTR_1/16-1/8','NRTR_1/8-1/4','RobustScanner','SAR','SAR_CN', \
+                                               'SATRN','SATRN_sm','SEG','Tesseract'], 7, \
+                 help='Text recognition algorithm. (default = SAR)')
+             st.write("###### *More about text recognition models* 👉 \
+ [here](https://mmocr.readthedocs.io/en/latest/textrecog_models.html)")
+         """
+         #with st.expander("Choose recognition hyperparameters for " + reader_type_list[3], \
+         with st.expander("Choose recognition hyperparameters for " + reader_type_list[2], \
+                          expanded=False):
+             t3r_psm = st.selectbox('Page segmentation mode (psm)', \
+                                    [' - Default', \
+                                     ' 4 Assume a single column of text of variable sizes', \
+                                     ' 5 Assume a single uniform block of vertically aligned \
+ text', \
+                                     ' 6 Assume a single uniform block of text', \
+                                     ' 7 Treat the image as a single text line', \
+                                     ' 8 Treat the image as a single word', \
+                                     ' 9 Treat the image as a single word in a circle', \
+                                     '10 Treat the image as a single character', \
+                                     '11 Sparse text. Find as much text as possible in no \
+ particular order', \
+                                     '13 Raw line. Treat the image as a single text line, \
+ bypassing hacks that are Tesseract-specific'])
+             t3r_oem = st.selectbox('OCR engine mode', ['0 Legacy engine only', \
+                                                        '1 Neural nets LSTM engine only', \
+                                                        '2 Legacy + LSTM engines', \
+                                                        '3 Default, based on what is available'], 3)
+             t3r_whitelist = st.text_input('Limit tesseract to recognize only these \
+ characters :', \
+                 placeholder='Limit tesseract to recognize only these characters', \
+                 help='Example for numbers only : 0123456789')
+ 
+         submit_reco = st.form_submit_button("Launch recognition")
+ 
+     if submit_reco:
+         process_detect.clear()
+         ##----------- Process recognition ------------------------------------------
+         reader_ind = reader_type_dict[st.session_state.detect_reader]
+         list_boxes = list_coordinates[reader_ind]
+ 
+         # Construct the config Tesseract parameter
+         t3r_config = ''
+         psm = t3r_psm[:2]
+         if psm != ' -':
+             t3r_config += '--psm ' + psm.strip()
+         oem = t3r_oem[:1]
+         if oem != '3':
+             t3r_config += ' --oem ' + oem
+         if t3r_whitelist != '':
+             t3r_config += ' -c tessedit_char_whitelist=' + t3r_whitelist
+ 
+         list_params_rec = \
+             [{'decoder': t0_decoder, 'beamWidth': t0_beamWidth, \
+               'batch_size': t0_batch_size, 'workers': t0_workers, \
+               'allowlist': t0_allowlist, 'blocklist': t0_blocklist, \
+               'detail': t0_detail, 'paragraph': t0_paragraph, \
+               'contrast_ths': t0_contrast_ths, 'adjust_contrast': t0_adjust_contrast
+              },
+              {**list_params_det[1][1], **{'rec_algorithm': t1_rec_algorithm, \
+               'rec_batch_num': t1_rec_batch_num, 'max_text_length': t1_max_text_length, \
+               'use_space_char': t1_use_space_char, 'drop_score': t1_drop_score}, \
+               **{'lang': list_params_det[1][0]}
+              },
+              #{'recog': t2_recog},
+              {'lang': tesserocr_lang, 'config': t3r_config}
+             ]
+ 
+         show_info2 = st.empty()
+ 
+         with show_info2.container():
+             st.info("Text recognition in progress ...")
+             df_results, df_results_tesseract, list_reco_status = \
+                 process_recog(list_readers, list_images[1], list_boxes, list_params_rec)
+         show_info2.empty()
+ 
+         st.session_state.df_results = df_results
+         st.session_state.list_boxes = list_boxes
+         st.session_state.df_results_tesseract = df_results_tesseract
+         st.session_state.list_reco_status = list_reco_status
+ 
+     if 'df_results' in st.session_state:
+         if not st.session_state.df_results.empty:
+             ##----------- Show recognition results ------------------------------------------------------------
+             results_cols = st.session_state.df_results.columns
+             list_col_text = np.arange(1, len(cols_size), 2)
+             list_col_confid = np.arange(2, len(cols_size), 2)
+ 
+             dict_draw_reco = {'in_image': st.session_state.list_images[1], \
+                               'in_boxes_coordinates': st.session_state.list_boxes, \
+                               'in_list_texts': [st.session_state.df_results[x].to_list() \
+                                                 for x in results_cols[list_col_text]], \
+                               'in_list_confid': [st.session_state.df_results[x].to_list() \
+                                                  for x in results_cols[list_col_confid]], \
+                               'in_dict_back_colors': dict_back_colors, \
+                               'in_df_results_tesseract': st.session_state.df_results_tesseract, \
+                               'in_reader_type_list': reader_type_list
+                              }
+             show_reco = st.empty()
+ 
+             with st.form("form3"):
+                 st.plotly_chart(fig_colorscale, use_container_width=True)
+ 
+                 col_font, col_threshold = st.columns(2)
+ 
+                 col_font.slider('Font scale', 1, 7, 1, step=1, key="font_scale_sld")
+                 col_threshold.slider('% confidence threshold for text color change', 40, 100, 64, \
+                                      step=1, key="conf_threshold_sld")
+                 col_threshold.write("(text color is black below this % confidence threshold, \
+ and white above)")
+ 
+                 draw_reco_images(**dict_draw_reco)
+ 
+                 submit_resize = st.form_submit_button("Refresh")
+ 
+             if submit_resize:
+                 draw_reco_images(**dict_draw_reco, \
+                                  in_font_scale=st.session_state.font_scale_sld, \
+                                  in_conf_threshold=st.session_state.conf_threshold_sld)
+ 
+             st.subheader("Recognition details")
+             #with st.expander("Detailed areas for EasyOCR, PPOCR, MMOCR", expanded=True):
+             with st.expander("Detailed areas for EasyOCR, PPOCR", expanded=True):
+                 cols = st.columns(cols_size)
+                 cols[0].markdown('#### Detected area')
+                 for i in range(1, (len(reader_type_list)-1)*2, 2):
+                     cols[i].markdown('#### with ' + reader_type_list[i//2])
+ 
+                 for row in st.session_state.df_results.itertuples():
+                     #cols = st.columns(1 + len(reader_type_list)*2)
+                     cols = st.columns(cols_size)
+                     cols[0].image(row.cropped_image, width=150)
+                     for ind_col in range(1, len(cols), 2):
+                         cols[ind_col].write(getattr(row, results_cols[ind_col]))
+                         cols[ind_col+1].write("(" + str( \
+                             getattr(row, results_cols[ind_col+1])) + "%)")
+ 
+                 st.download_button(
+                     label="Download results as CSV file",
+                     data=convert_df(st.session_state.df_results),
+                     file_name='OCR_comparator_results.csv',
+                     mime='text/csv',
+                 )
+ 
+             if not st.session_state.df_results_tesseract.empty:
+                 with st.expander("Detailed areas for Tesseract", expanded=False):
+                     cols = st.columns([2, 2, 1])
+                     cols[0].markdown('#### Detected area')
+                     cols[1].markdown('#### with Tesseract')
+ 
+                     for row in st.session_state.df_results_tesseract.itertuples():
+                         cols = st.columns([2, 2, 1])
+                         cols[0].image(row.cropped, width=150)
+                         cols[1].write(getattr(row, 'text'))
+                         cols[2].write("(" + str(getattr(row, 'conf')) + "%)")
+ 
+                     st.download_button(
+                         label="Download Tesseract results as CSV file",
+                         data=convert_df(st.session_state.df_results_tesseract),
+                         file_name='OCR_comparator_Tesseract_results.csv',
+                         mime='text/csv',
+                     )
configs/_base_/default_runtime.py ADDED
@@ -0,0 +1,17 @@
+ # yapf:disable
+ log_config = dict(
+     interval=5,
+     hooks=[
+         dict(type='TextLoggerHook')
+     ])
+ # yapf:enable
+ dist_params = dict(backend='nccl')
+ log_level = 'INFO'
+ load_from = None
+ resume_from = None
+ workflow = [('train', 1)]
+ 
+ # disable opencv multithreading to avoid system being overloaded
+ opencv_num_threads = 0
+ # set multi-process start method as `fork` to speed up the training
+ mp_start_method = 'fork'
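These `_base_` files are not run directly: a full config inherits them through mmcv's `_base_` mechanism and overrides individual keys. A minimal sketch (the interval override is an arbitrary example, not part of this repo):

    # child_config.py -- sketch of mmcv-style config inheritance
    _base_ = ['../_base_/default_runtime.py']
    # overriding one inherited key; everything else is kept as defined above
    log_config = dict(interval=10, hooks=[dict(type='TextLoggerHook')])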
configs/_base_/det_datasets/ctw1500.py ADDED
@@ -0,0 +1,18 @@
+ dataset_type = 'IcdarDataset'
+ data_root = 'data/ctw1500'
+ 
+ train = dict(
+     type=dataset_type,
+     ann_file=f'{data_root}/instances_training.json',
+     img_prefix=f'{data_root}/imgs',
+     pipeline=None)
+ 
+ test = dict(
+     type=dataset_type,
+     ann_file=f'{data_root}/instances_test.json',
+     img_prefix=f'{data_root}/imgs',
+     pipeline=None)
+ 
+ train_list = [train]
+ 
+ test_list = [test]
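Downstream configs typically splice `train_list`/`test_list` into the `data` dict together with a pipeline; a sketch of the usual MMOCR 0.x pattern (batch sizes are placeholders, not values from this repo):

    # sketch: how a full config consumes train_list / test_list from this base file
    train_list = {{_base_.train_list}}
    test_list = {{_base_.test_list}}
    data = dict(
        samples_per_gpu=8,   # placeholder batch size
        workers_per_gpu=4,   # placeholder worker count
        train=dict(
            type='UniformConcatDataset',
            datasets=train_list,
            pipeline=train_pipeline),
        test=dict(
            type='UniformConcatDataset',
            datasets=test_list,
            pipeline=test_pipeline))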
configs/_base_/det_datasets/icdar2015.py ADDED
@@ -0,0 +1,18 @@
+ dataset_type = 'IcdarDataset'
+ data_root = 'data/icdar2015'
+ 
+ train = dict(
+     type=dataset_type,
+     ann_file=f'{data_root}/instances_training.json',
+     img_prefix=f'{data_root}/imgs',
+     pipeline=None)
+ 
+ test = dict(
+     type=dataset_type,
+     ann_file=f'{data_root}/instances_test.json',
+     img_prefix=f'{data_root}/imgs',
+     pipeline=None)
+ 
+ train_list = [train]
+ 
+ test_list = [test]
configs/_base_/det_datasets/icdar2017.py ADDED
@@ -0,0 +1,18 @@
+ dataset_type = 'IcdarDataset'
+ data_root = 'data/icdar2017'
+ 
+ train = dict(
+     type=dataset_type,
+     ann_file=f'{data_root}/instances_training.json',
+     img_prefix=f'{data_root}/imgs',
+     pipeline=None)
+ 
+ test = dict(
+     type=dataset_type,
+     ann_file=f'{data_root}/instances_val.json',
+     img_prefix=f'{data_root}/imgs',
+     pipeline=None)
+ 
+ train_list = [train]
+ 
+ test_list = [test]
configs/_base_/det_datasets/synthtext.py ADDED
@@ -0,0 +1,18 @@
+ dataset_type = 'TextDetDataset'
+ data_root = 'data/synthtext'
+ 
+ train = dict(
+     type=dataset_type,
+     ann_file=f'{data_root}/instances_training.lmdb',
+     loader=dict(
+         type='AnnFileLoader',
+         repeat=1,
+         file_format='lmdb',
+         parser=dict(
+             type='LineJsonParser',
+             keys=['file_name', 'height', 'width', 'annotations'])),
+     img_prefix=f'{data_root}/imgs',
+     pipeline=None)
+ 
+ train_list = [train]
+ test_list = [train]
configs/_base_/det_datasets/toy_data.py ADDED
@@ -0,0 +1,41 @@
+ root = 'tests/data/toy_dataset'
+ 
+ # dataset with type='TextDetDataset'
+ train1 = dict(
+     type='TextDetDataset',
+     img_prefix=f'{root}/imgs',
+     ann_file=f'{root}/instances_test.txt',
+     loader=dict(
+         type='AnnFileLoader',
+         repeat=4,
+         file_format='txt',
+         parser=dict(
+             type='LineJsonParser',
+             keys=['file_name', 'height', 'width', 'annotations'])),
+     pipeline=None,
+     test_mode=False)
+ 
+ # dataset with type='IcdarDataset'
+ train2 = dict(
+     type='IcdarDataset',
+     ann_file=f'{root}/instances_test.json',
+     img_prefix=f'{root}/imgs',
+     pipeline=None)
+ 
+ test = dict(
+     type='TextDetDataset',
+     img_prefix=f'{root}/imgs',
+     ann_file=f'{root}/instances_test.txt',
+     loader=dict(
+         type='AnnFileLoader',
+         repeat=1,
+         file_format='txt',
+         parser=dict(
+             type='LineJsonParser',
+             keys=['file_name', 'height', 'width', 'annotations'])),
+     pipeline=None,
+     test_mode=True)
+ 
+ train_list = [train1, train2]
+ 
+ test_list = [test]
configs/_base_/det_models/dbnet_r18_fpnc.py ADDED
@@ -0,0 +1,21 @@
+ model = dict(
+     type='DBNet',
+     backbone=dict(
+         type='mmdet.ResNet',
+         depth=18,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         frozen_stages=-1,
+         norm_cfg=dict(type='BN', requires_grad=True),
+         init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18'),
+         norm_eval=False,
+         style='caffe'),
+     neck=dict(
+         type='FPNC', in_channels=[64, 128, 256, 512], lateral_channels=256),
+     bbox_head=dict(
+         type='DBHead',
+         in_channels=256,
+         loss=dict(type='DBLoss', alpha=5.0, beta=10.0, bbce_loss=True),
+         postprocessor=dict(type='DBPostprocessor', text_repr_type='quad')),
+     train_cfg=None,
+     test_cfg=None)
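A complete experiment config is then assembled simply by listing the `_base_` pieces; a sketch with illustrative relative paths (the exact paths depend on where the child config lives):

    # dbnet_r18_fpnc_icdar2015.py -- sketch of composing the _base_ pieces
    _base_ = [
        '../../_base_/default_runtime.py',
        '../../_base_/det_models/dbnet_r18_fpnc.py',
        '../../_base_/det_datasets/icdar2015.py',
        '../../_base_/det_pipelines/dbnet_pipeline.py',
    ]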
configs/_base_/det_models/dbnet_r50dcnv2_fpnc.py ADDED
@@ -0,0 +1,23 @@
+ model = dict(
+     type='DBNet',
+     backbone=dict(
+         type='mmdet.ResNet',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         frozen_stages=-1,
+         norm_cfg=dict(type='BN', requires_grad=True),
+         norm_eval=False,
+         style='pytorch',
+         dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+         init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+         stage_with_dcn=(False, True, True, True)),
+     neck=dict(
+         type='FPNC', in_channels=[256, 512, 1024, 2048], lateral_channels=256),
+     bbox_head=dict(
+         type='DBHead',
+         in_channels=256,
+         loss=dict(type='DBLoss', alpha=5.0, beta=10.0, bbce_loss=True),
+         postprocessor=dict(type='DBPostprocessor', text_repr_type='quad')),
+     train_cfg=None,
+     test_cfg=None)
configs/_base_/det_models/dbnetpp_r50dcnv2_fpnc.py ADDED
@@ -0,0 +1,28 @@
+ model = dict(
+     type='DBNet',
+     backbone=dict(
+         type='mmdet.ResNet',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         frozen_stages=-1,
+         norm_cfg=dict(type='BN', requires_grad=True),
+         norm_eval=False,
+         style='pytorch',
+         dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+         init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+         stage_with_dcn=(False, True, True, True)),
+     neck=dict(
+         type='FPNC',
+         in_channels=[256, 512, 1024, 2048],
+         lateral_channels=256,
+         asf_cfg=dict(attention_type='ScaleChannelSpatial')),
+     bbox_head=dict(
+         type='DBHead',
+         in_channels=256,
+         loss=dict(type='DBLoss', alpha=5.0, beta=10.0, bbce_loss=True),
+         postprocessor=dict(
+             type='DBPostprocessor', text_repr_type='quad',
+             epsilon_ratio=0.002)),
+     train_cfg=None,
+     test_cfg=None)
configs/_base_/det_models/drrg_r50_fpn_unet.py ADDED
@@ -0,0 +1,21 @@
+ model = dict(
+     type='DRRG',
+     backbone=dict(
+         type='mmdet.ResNet',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         frozen_stages=-1,
+         norm_cfg=dict(type='BN', requires_grad=True),
+         init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+         norm_eval=True,
+         style='caffe'),
+     neck=dict(
+         type='FPN_UNet', in_channels=[256, 512, 1024, 2048], out_channels=32),
+     bbox_head=dict(
+         type='DRRGHead',
+         in_channels=32,
+         text_region_thr=0.3,
+         center_region_thr=0.4,
+         loss=dict(type='DRRGLoss'),
+         postprocessor=dict(type='DRRGPostprocessor', link_thr=0.80)))
configs/_base_/det_models/fcenet_r50_fpn.py ADDED
@@ -0,0 +1,33 @@
+ model = dict(
+     type='FCENet',
+     backbone=dict(
+         type='mmdet.ResNet',
+         depth=50,
+         num_stages=4,
+         out_indices=(1, 2, 3),
+         frozen_stages=-1,
+         norm_cfg=dict(type='BN', requires_grad=True),
+         init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+         norm_eval=False,
+         style='pytorch'),
+     neck=dict(
+         type='mmdet.FPN',
+         in_channels=[512, 1024, 2048],
+         out_channels=256,
+         add_extra_convs='on_output',
+         num_outs=3,
+         relu_before_extra_convs=True,
+         act_cfg=None),
+     bbox_head=dict(
+         type='FCEHead',
+         in_channels=256,
+         scales=(8, 16, 32),
+         fourier_degree=5,
+         loss=dict(type='FCELoss', num_sample=50),
+         postprocessor=dict(
+             type='FCEPostprocessor',
+             text_repr_type='quad',
+             num_reconstr_points=50,
+             alpha=1.2,
+             beta=1.0,
+             score_thr=0.3)))
configs/_base_/det_models/fcenet_r50dcnv2_fpn.py ADDED
@@ -0,0 +1,35 @@
+ model = dict(
+     type='FCENet',
+     backbone=dict(
+         type='mmdet.ResNet',
+         depth=50,
+         num_stages=4,
+         out_indices=(1, 2, 3),
+         frozen_stages=-1,
+         norm_cfg=dict(type='BN', requires_grad=True),
+         norm_eval=True,
+         style='pytorch',
+         dcn=dict(type='DCNv2', deform_groups=2, fallback_on_stride=False),
+         init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+         stage_with_dcn=(False, True, True, True)),
+     neck=dict(
+         type='mmdet.FPN',
+         in_channels=[512, 1024, 2048],
+         out_channels=256,
+         add_extra_convs='on_output',
+         num_outs=3,
+         relu_before_extra_convs=True,
+         act_cfg=None),
+     bbox_head=dict(
+         type='FCEHead',
+         in_channels=256,
+         scales=(8, 16, 32),
+         fourier_degree=5,
+         loss=dict(type='FCELoss', num_sample=50),
+         postprocessor=dict(
+             type='FCEPostprocessor',
+             text_repr_type='poly',
+             num_reconstr_points=50,
+             alpha=1.0,
+             beta=2.0,
+             score_thr=0.3)))
configs/_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem.py ADDED
@@ -0,0 +1,126 @@
+ # model settings
+ model = dict(
+     type='OCRMaskRCNN',
+     backbone=dict(
+         type='mmdet.ResNet',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         frozen_stages=1,
+         norm_cfg=dict(type='BN', requires_grad=True),
+         init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+         norm_eval=True,
+         style='pytorch'),
+     neck=dict(
+         type='mmdet.FPN',
+         in_channels=[256, 512, 1024, 2048],
+         out_channels=256,
+         num_outs=5),
+     rpn_head=dict(
+         type='RPNHead',
+         in_channels=256,
+         feat_channels=256,
+         anchor_generator=dict(
+             type='AnchorGenerator',
+             scales=[4],
+             ratios=[0.17, 0.44, 1.13, 2.90, 7.46],
+             strides=[4, 8, 16, 32, 64]),
+         bbox_coder=dict(
+             type='DeltaXYWHBBoxCoder',
+             target_means=[.0, .0, .0, .0],
+             target_stds=[1.0, 1.0, 1.0, 1.0]),
+         loss_cls=dict(
+             type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+         loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+     roi_head=dict(
+         type='StandardRoIHead',
+         bbox_roi_extractor=dict(
+             type='SingleRoIExtractor',
+             roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+             out_channels=256,
+             featmap_strides=[4, 8, 16, 32]),
+         bbox_head=dict(
+             type='Shared2FCBBoxHead',
+             in_channels=256,
+             fc_out_channels=1024,
+             roi_feat_size=7,
+             num_classes=1,
+             bbox_coder=dict(
+                 type='DeltaXYWHBBoxCoder',
+                 target_means=[0., 0., 0., 0.],
+                 target_stds=[0.1, 0.1, 0.2, 0.2]),
+             reg_class_agnostic=False,
+             loss_cls=dict(
+                 type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+             loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+         mask_roi_extractor=dict(
+             type='SingleRoIExtractor',
+             roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+             out_channels=256,
+             featmap_strides=[4, 8, 16, 32]),
+         mask_head=dict(
+             type='FCNMaskHead',
+             num_convs=4,
+             in_channels=256,
+             conv_out_channels=256,
+             num_classes=1,
+             loss_mask=dict(
+                 type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
+ 
+     # model training and testing settings
+     train_cfg=dict(
+         rpn=dict(
+             assigner=dict(
+                 type='MaxIoUAssigner',
+                 pos_iou_thr=0.7,
+                 neg_iou_thr=0.3,
+                 min_pos_iou=0.3,
+                 match_low_quality=True,
+                 ignore_iof_thr=-1,
+                 gpu_assign_thr=50),
+             sampler=dict(
+                 type='RandomSampler',
+                 num=256,
+                 pos_fraction=0.5,
+                 neg_pos_ub=-1,
+                 add_gt_as_proposals=False),
+             allowed_border=-1,
+             pos_weight=-1,
+             debug=False),
+         rpn_proposal=dict(
+             nms_across_levels=False,
+             nms_pre=2000,
+             nms_post=1000,
+             max_per_img=1000,
+             nms=dict(type='nms', iou_threshold=0.7),
+             min_bbox_size=0),
+         rcnn=dict(
+             assigner=dict(
+                 type='MaxIoUAssigner',
+                 pos_iou_thr=0.5,
+                 neg_iou_thr=0.5,
+                 min_pos_iou=0.5,
+                 match_low_quality=True,
+                 ignore_iof_thr=-1),
+             sampler=dict(
+                 type='OHEMSampler',
+                 num=512,
+                 pos_fraction=0.25,
+                 neg_pos_ub=-1,
+                 add_gt_as_proposals=True),
+             mask_size=28,
+             pos_weight=-1,
+             debug=False)),
+     test_cfg=dict(
+         rpn=dict(
+             nms_across_levels=False,
+             nms_pre=1000,
+             nms_post=1000,
+             max_per_img=1000,
+             nms=dict(type='nms', iou_threshold=0.7),
+             min_bbox_size=0),
+         rcnn=dict(
+             score_thr=0.05,
+             nms=dict(type='nms', iou_threshold=0.5),
+             max_per_img=100,
+             mask_thr_binary=0.5)))
configs/_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem_poly.py ADDED
@@ -0,0 +1,126 @@
+ # model settings
+ model = dict(
+     type='OCRMaskRCNN',
+     text_repr_type='poly',
+     backbone=dict(
+         type='mmdet.ResNet',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         frozen_stages=1,
+         norm_cfg=dict(type='BN', requires_grad=True),
+         norm_eval=True,
+         init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+         style='pytorch'),
+     neck=dict(
+         type='mmdet.FPN',
+         in_channels=[256, 512, 1024, 2048],
+         out_channels=256,
+         num_outs=5),
+     rpn_head=dict(
+         type='RPNHead',
+         in_channels=256,
+         feat_channels=256,
+         anchor_generator=dict(
+             type='AnchorGenerator',
+             scales=[4],
+             ratios=[0.17, 0.44, 1.13, 2.90, 7.46],
+             strides=[4, 8, 16, 32, 64]),
+         bbox_coder=dict(
+             type='DeltaXYWHBBoxCoder',
+             target_means=[.0, .0, .0, .0],
+             target_stds=[1.0, 1.0, 1.0, 1.0]),
+         loss_cls=dict(
+             type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+         loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+     roi_head=dict(
+         type='StandardRoIHead',
+         bbox_roi_extractor=dict(
+             type='SingleRoIExtractor',
+             roi_layer=dict(type='RoIAlign', output_size=7, sample_num=0),
+             out_channels=256,
+             featmap_strides=[4, 8, 16, 32]),
+         bbox_head=dict(
+             type='Shared2FCBBoxHead',
+             in_channels=256,
+             fc_out_channels=1024,
+             roi_feat_size=7,
+             num_classes=80,
+             bbox_coder=dict(
+                 type='DeltaXYWHBBoxCoder',
+                 target_means=[0., 0., 0., 0.],
+                 target_stds=[0.1, 0.1, 0.2, 0.2]),
+             reg_class_agnostic=False,
+             loss_cls=dict(
+                 type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+             loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+         mask_roi_extractor=dict(
+             type='SingleRoIExtractor',
+             roi_layer=dict(type='RoIAlign', output_size=14, sample_num=0),
+             out_channels=256,
+             featmap_strides=[4, 8, 16, 32]),
+         mask_head=dict(
+             type='FCNMaskHead',
+             num_convs=4,
+             in_channels=256,
+             conv_out_channels=256,
+             num_classes=80,
+             loss_mask=dict(
+                 type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
+     # model training and testing settings
+     train_cfg=dict(
+         rpn=dict(
+             assigner=dict(
+                 type='MaxIoUAssigner',
+                 pos_iou_thr=0.7,
+                 neg_iou_thr=0.3,
+                 min_pos_iou=0.3,
+                 match_low_quality=True,
+                 ignore_iof_thr=-1),
+             sampler=dict(
+                 type='RandomSampler',
+                 num=256,
+                 pos_fraction=0.5,
+                 neg_pos_ub=-1,
+                 add_gt_as_proposals=False),
+             allowed_border=-1,
+             pos_weight=-1,
+             debug=False),
+         rpn_proposal=dict(
+             nms_across_levels=False,
+             nms_pre=2000,
+             nms_post=1000,
+             max_per_img=1000,
+             nms=dict(type='nms', iou_threshold=0.7),
+             min_bbox_size=0),
+         rcnn=dict(
+             assigner=dict(
+                 type='MaxIoUAssigner',
+                 pos_iou_thr=0.5,
+                 neg_iou_thr=0.5,
+                 min_pos_iou=0.5,
+                 match_low_quality=True,
+                 ignore_iof_thr=-1,
+                 gpu_assign_thr=50),
+             sampler=dict(
+                 type='OHEMSampler',
+                 num=512,
+                 pos_fraction=0.25,
+                 neg_pos_ub=-1,
+                 add_gt_as_proposals=True),
+             mask_size=28,
+             pos_weight=-1,
+             debug=False)),
+     test_cfg=dict(
+         rpn=dict(
+             nms_across_levels=False,
+             nms_pre=1000,
+             nms_post=1000,
+             max_per_img=1000,
+             nms=dict(type='nms', iou_threshold=0.7),
+             min_bbox_size=0),
+         rcnn=dict(
+             score_thr=0.05,
+             nms=dict(type='nms', iou_threshold=0.5),
+             max_per_img=100,
+             mask_thr_binary=0.5)))
configs/_base_/det_models/panet_r18_fpem_ffm.py ADDED
@@ -0,0 +1,43 @@
+ model_poly = dict(
+     type='PANet',
+     backbone=dict(
+         type='mmdet.ResNet',
+         depth=18,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         frozen_stages=-1,
+         norm_cfg=dict(type='SyncBN', requires_grad=True),
+         init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18'),
+         norm_eval=True,
+         style='caffe'),
+     neck=dict(type='FPEM_FFM', in_channels=[64, 128, 256, 512]),
+     bbox_head=dict(
+         type='PANHead',
+         in_channels=[128, 128, 128, 128],
+         out_channels=6,
+         loss=dict(type='PANLoss'),
+         postprocessor=dict(type='PANPostprocessor', text_repr_type='poly')),
+     train_cfg=None,
+     test_cfg=None)
+ 
+ model_quad = dict(
+     type='PANet',
+     backbone=dict(
+         type='mmdet.ResNet',
+         depth=18,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         frozen_stages=-1,
+         norm_cfg=dict(type='SyncBN', requires_grad=True),
+         init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18'),
+         norm_eval=True,
+         style='caffe'),
+     neck=dict(type='FPEM_FFM', in_channels=[64, 128, 256, 512]),
+     bbox_head=dict(
+         type='PANHead',
+         in_channels=[128, 128, 128, 128],
+         out_channels=6,
+         loss=dict(type='PANLoss'),
+         postprocessor=dict(type='PANPostprocessor', text_repr_type='quad')),
+     train_cfg=None,
+     test_cfg=None)
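Because this file exposes two variants, a child config has to pick one explicitly; a sketch using mmcv's `{{_base_.*}}` reference syntax (path shown is illustrative):

    # sketch: a child config selecting the quad variant defined above
    _base_ = ['../../_base_/det_models/panet_r18_fpem_ffm.py']
    model = {{_base_.model_quad}}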
configs/_base_/det_models/panet_r50_fpem_ffm.py ADDED
@@ -0,0 +1,21 @@
+ model = dict(
+     type='PANet',
+     pretrained='torchvision://resnet50',
+     backbone=dict(
+         type='mmdet.ResNet',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         frozen_stages=1,
+         norm_cfg=dict(type='BN', requires_grad=True),
+         norm_eval=True,
+         style='caffe'),
+     neck=dict(type='FPEM_FFM', in_channels=[256, 512, 1024, 2048]),
+     bbox_head=dict(
+         type='PANHead',
+         in_channels=[128, 128, 128, 128],
+         out_channels=6,
+         loss=dict(type='PANLoss', speedup_bbox_thr=32),
+         postprocessor=dict(type='PANPostprocessor', text_repr_type='poly')),
+     train_cfg=None,
+     test_cfg=None)
configs/_base_/det_models/psenet_r50_fpnf.py ADDED
@@ -0,0 +1,51 @@
+ model_poly = dict(
+     type='PSENet',
+     backbone=dict(
+         type='mmdet.ResNet',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         frozen_stages=-1,
+         norm_cfg=dict(type='SyncBN', requires_grad=True),
+         init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+         norm_eval=True,
+         style='caffe'),
+     neck=dict(
+         type='FPNF',
+         in_channels=[256, 512, 1024, 2048],
+         out_channels=256,
+         fusion_type='concat'),
+     bbox_head=dict(
+         type='PSEHead',
+         in_channels=[256],
+         out_channels=7,
+         loss=dict(type='PSELoss'),
+         postprocessor=dict(type='PSEPostprocessor', text_repr_type='poly')),
+     train_cfg=None,
+     test_cfg=None)
+ 
+ model_quad = dict(
+     type='PSENet',
+     backbone=dict(
+         type='mmdet.ResNet',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         frozen_stages=-1,
+         norm_cfg=dict(type='SyncBN', requires_grad=True),
+         init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+         norm_eval=True,
+         style='caffe'),
+     neck=dict(
+         type='FPNF',
+         in_channels=[256, 512, 1024, 2048],
+         out_channels=256,
+         fusion_type='concat'),
+     bbox_head=dict(
+         type='PSEHead',
+         in_channels=[256],
+         out_channels=7,
+         loss=dict(type='PSELoss'),
+         postprocessor=dict(type='PSEPostprocessor', text_repr_type='quad')),
+     train_cfg=None,
+     test_cfg=None)
configs/_base_/det_models/textsnake_r50_fpn_unet.py ADDED
@@ -0,0 +1,22 @@
+ model = dict(
+     type='TextSnake',
+     backbone=dict(
+         type='mmdet.ResNet',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         frozen_stages=-1,
+         norm_cfg=dict(type='BN', requires_grad=True),
+         init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+         norm_eval=True,
+         style='caffe'),
+     neck=dict(
+         type='FPN_UNet', in_channels=[256, 512, 1024, 2048], out_channels=32),
+     bbox_head=dict(
+         type='TextSnakeHead',
+         in_channels=32,
+         loss=dict(type='TextSnakeLoss'),
+         postprocessor=dict(
+             type='TextSnakePostprocessor', text_repr_type='poly')),
+     train_cfg=None,
+     test_cfg=None)
configs/_base_/det_pipelines/dbnet_pipeline.py ADDED
@@ -0,0 +1,88 @@
+ img_norm_cfg = dict(
+     mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+ 
+ train_pipeline_r18 = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='LoadTextAnnotations',
+         with_bbox=True,
+         with_mask=True,
+         poly2mask=False),
+     dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
+     dict(type='Normalize', **img_norm_cfg),
+     dict(
+         type='ImgAug',
+         args=[['Fliplr', 0.5],
+               dict(cls='Affine', rotate=[-10, 10]), ['Resize', [0.5, 3.0]]]),
+     dict(type='EastRandomCrop', target_size=(640, 640)),
+     dict(type='DBNetTargets', shrink_ratio=0.4),
+     dict(type='Pad', size_divisor=32),
+     dict(
+         type='CustomFormatBundle',
+         keys=['gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'],
+         visualize=dict(flag=False, boundary_key='gt_shrink')),
+     dict(
+         type='Collect',
+         keys=['img', 'gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'])
+ ]
+ 
+ test_pipeline_1333_736 = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='MultiScaleFlipAug',
+         img_scale=(1333, 736),  # used by Resize
+         flip=False,
+         transforms=[
+             dict(type='Resize', keep_ratio=True),
+             dict(type='Normalize', **img_norm_cfg),
+             dict(type='Pad', size_divisor=32),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img']),
+         ])
+ ]
+ 
+ # for dbnet_r50dcnv2_fpnc
+ img_norm_cfg_r50dcnv2 = dict(
+     mean=[122.67891434, 116.66876762, 104.00698793],
+     std=[58.395, 57.12, 57.375],
+     to_rgb=True)
+ 
+ train_pipeline_r50dcnv2 = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='LoadTextAnnotations',
+         with_bbox=True,
+         with_mask=True,
+         poly2mask=False),
+     dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
+     dict(type='Normalize', **img_norm_cfg_r50dcnv2),
+     dict(
+         type='ImgAug',
+         args=[['Fliplr', 0.5],
+               dict(cls='Affine', rotate=[-10, 10]), ['Resize', [0.5, 3.0]]]),
+     dict(type='EastRandomCrop', target_size=(640, 640)),
+     dict(type='DBNetTargets', shrink_ratio=0.4),
+     dict(type='Pad', size_divisor=32),
+     dict(
+         type='CustomFormatBundle',
+         keys=['gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'],
+         visualize=dict(flag=False, boundary_key='gt_shrink')),
+     dict(
+         type='Collect',
+         keys=['img', 'gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'])
+ ]
+ 
+ test_pipeline_4068_1024 = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='MultiScaleFlipAug',
+         img_scale=(4068, 1024),  # used by Resize
+         flip=False,
+         transforms=[
+             dict(type='Resize', keep_ratio=True),
+             dict(type='Normalize', **img_norm_cfg_r50dcnv2),
+             dict(type='Pad', size_divisor=32),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img']),
+         ])
+ ]
configs/_base_/det_pipelines/drrg_pipeline.py ADDED
@@ -0,0 +1,60 @@
+ img_norm_cfg = dict(
+     mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+ 
+ train_pipeline = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='LoadTextAnnotations',
+         with_bbox=True,
+         with_mask=True,
+         poly2mask=False),
+     dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
+     dict(type='Normalize', **img_norm_cfg),
+     dict(type='RandomScaling', size=800, scale=(0.75, 2.5)),
+     dict(
+         type='RandomCropFlip', crop_ratio=0.5, iter_num=1, min_area_ratio=0.2),
+     dict(
+         type='RandomCropPolyInstances',
+         instance_key='gt_masks',
+         crop_ratio=0.8,
+         min_side_ratio=0.3),
+     dict(
+         type='RandomRotatePolyInstances',
+         rotate_ratio=0.5,
+         max_angle=60,
+         pad_with_fixed_color=False),
+     dict(type='SquareResizePad', target_size=800, pad_ratio=0.6),
+     dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
+     dict(type='DRRGTargets'),
+     dict(type='Pad', size_divisor=32),
+     dict(
+         type='CustomFormatBundle',
+         keys=[
+             'gt_text_mask', 'gt_center_region_mask', 'gt_mask',
+             'gt_top_height_map', 'gt_bot_height_map', 'gt_sin_map',
+             'gt_cos_map', 'gt_comp_attribs'
+         ],
+         visualize=dict(flag=False, boundary_key='gt_text_mask')),
+     dict(
+         type='Collect',
+         keys=[
+             'img', 'gt_text_mask', 'gt_center_region_mask', 'gt_mask',
+             'gt_top_height_map', 'gt_bot_height_map', 'gt_sin_map',
+             'gt_cos_map', 'gt_comp_attribs'
+         ])
+ ]
+ 
+ test_pipeline = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='MultiScaleFlipAug',
+         img_scale=(1024, 640),  # used by Resize
+         flip=False,
+         transforms=[
+             dict(type='Resize', keep_ratio=True),
+             dict(type='Normalize', **img_norm_cfg),
+             dict(type='Pad', size_divisor=32),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img']),
+         ])
+ ]
configs/_base_/det_pipelines/fcenet_pipeline.py ADDED
@@ -0,0 +1,118 @@
+ img_norm_cfg = dict(
+     mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+ 
+ # for icdar2015
+ leval_prop_range_icdar2015 = ((0, 0.4), (0.3, 0.7), (0.6, 1.0))
+ train_pipeline_icdar2015 = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='LoadTextAnnotations',
+         with_bbox=True,
+         with_mask=True,
+         poly2mask=False),
+     dict(
+         type='ColorJitter',
+         brightness=32.0 / 255,
+         saturation=0.5,
+         contrast=0.5),
+     dict(type='Normalize', **img_norm_cfg),
+     dict(type='RandomScaling', size=800, scale=(3. / 4, 5. / 2)),
+     dict(
+         type='RandomCropFlip', crop_ratio=0.5, iter_num=1, min_area_ratio=0.2),
+     dict(
+         type='RandomCropPolyInstances',
+         instance_key='gt_masks',
+         crop_ratio=0.8,
+         min_side_ratio=0.3),
+     dict(
+         type='RandomRotatePolyInstances',
+         rotate_ratio=0.5,
+         max_angle=30,
+         pad_with_fixed_color=False),
+     dict(type='SquareResizePad', target_size=800, pad_ratio=0.6),
+     dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
+     dict(type='Pad', size_divisor=32),
+     dict(
+         type='FCENetTargets',
+         fourier_degree=5,
+         level_proportion_range=leval_prop_range_icdar2015),
+     dict(
+         type='CustomFormatBundle',
+         keys=['p3_maps', 'p4_maps', 'p5_maps'],
+         visualize=dict(flag=False, boundary_key=None)),
+     dict(type='Collect', keys=['img', 'p3_maps', 'p4_maps', 'p5_maps'])
+ ]
+ 
+ img_scale_icdar2015 = (2260, 2260)
+ test_pipeline_icdar2015 = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='MultiScaleFlipAug',
+         img_scale=img_scale_icdar2015,  # used by Resize
+         flip=False,
+         transforms=[
+             dict(type='Resize', keep_ratio=True),
+             dict(type='Normalize', **img_norm_cfg),
+             dict(type='Pad', size_divisor=32),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img']),
+         ])
+ ]
+ 
+ # for ctw1500
+ leval_prop_range_ctw1500 = ((0, 0.25), (0.2, 0.65), (0.55, 1.0))
+ train_pipeline_ctw1500 = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='LoadTextAnnotations',
+         with_bbox=True,
+         with_mask=True,
+         poly2mask=False),
+     dict(
+         type='ColorJitter',
+         brightness=32.0 / 255,
+         saturation=0.5,
+         contrast=0.5),
+     dict(type='Normalize', **img_norm_cfg),
+     dict(type='RandomScaling', size=800, scale=(3. / 4, 5. / 2)),
+     dict(
+         type='RandomCropFlip', crop_ratio=0.5, iter_num=1, min_area_ratio=0.2),
+     dict(
+         type='RandomCropPolyInstances',
+         instance_key='gt_masks',
+         crop_ratio=0.8,
+         min_side_ratio=0.3),
+     dict(
+         type='RandomRotatePolyInstances',
+         rotate_ratio=0.5,
+         max_angle=30,
+         pad_with_fixed_color=False),
+     dict(type='SquareResizePad', target_size=800, pad_ratio=0.6),
+     dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
+     dict(type='Pad', size_divisor=32),
+     dict(
+         type='FCENetTargets',
+         fourier_degree=5,
+         level_proportion_range=leval_prop_range_ctw1500),
+     dict(
+         type='CustomFormatBundle',
+         keys=['p3_maps', 'p4_maps', 'p5_maps'],
+         visualize=dict(flag=False, boundary_key=None)),
+     dict(type='Collect', keys=['img', 'p3_maps', 'p4_maps', 'p5_maps'])
+ ]
+ 
+ img_scale_ctw1500 = (1080, 736)
+ test_pipeline_ctw1500 = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='MultiScaleFlipAug',
+         img_scale=img_scale_ctw1500,  # used by Resize
+         flip=False,
+         transforms=[
+             dict(type='Resize', keep_ratio=True),
+             dict(type='Normalize', **img_norm_cfg),
+             dict(type='Pad', size_divisor=32),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img']),
+         ])
+ ]
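Since this file defines one train/test pipeline per dataset, child configs pull out the pair they need via the same `{{_base_.*}}` references; a sketch (path shown is illustrative):

    # sketch: picking the ctw1500 pipelines out of this base file
    _base_ = ['../../_base_/det_pipelines/fcenet_pipeline.py']
    train_pipeline = {{_base_.train_pipeline_ctw1500}}
    test_pipeline = {{_base_.test_pipeline_ctw1500}}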
configs/_base_/det_pipelines/maskrcnn_pipeline.py ADDED
@@ -0,0 +1,57 @@
+ img_norm_cfg = dict(
+     mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+ 
+ train_pipeline = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+     dict(
+         type='ScaleAspectJitter',
+         img_scale=None,
+         keep_ratio=False,
+         resize_type='indep_sample_in_range',
+         scale_range=(640, 2560)),
+     dict(type='RandomFlip', flip_ratio=0.5),
+     dict(type='Normalize', **img_norm_cfg),
+     dict(
+         type='RandomCropInstances',
+         target_size=(640, 640),
+         mask_type='union_all',
+         instance_key='gt_masks'),
+     dict(type='Pad', size_divisor=32),
+     dict(type='DefaultFormatBundle'),
+     dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+ ]
+ 
+ # for ctw1500
+ img_scale_ctw1500 = (1600, 1600)
+ test_pipeline_ctw1500 = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='MultiScaleFlipAug',
+         img_scale=img_scale_ctw1500,  # used by Resize
+         flip=False,
+         transforms=[
+             dict(type='Resize', keep_ratio=True),
+             dict(type='RandomFlip'),
+             dict(type='Normalize', **img_norm_cfg),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img']),
+         ])
+ ]
+ 
+ # for icdar2015
+ img_scale_icdar2015 = (1920, 1920)
+ test_pipeline_icdar2015 = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='MultiScaleFlipAug',
+         img_scale=img_scale_icdar2015,  # used by Resize
+         flip=False,
+         transforms=[
+             dict(type='Resize', keep_ratio=True),
+             dict(type='RandomFlip'),
+             dict(type='Normalize', **img_norm_cfg),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img']),
+         ])
+ ]
configs/_base_/det_pipelines/panet_pipeline.py ADDED
@@ -0,0 +1,156 @@
+ img_norm_cfg = dict(
+     mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+ 
+ # for ctw1500
+ img_scale_train_ctw1500 = [(3000, 640)]
+ shrink_ratio_train_ctw1500 = (1.0, 0.7)
+ target_size_train_ctw1500 = (640, 640)
+ train_pipeline_ctw1500 = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='LoadTextAnnotations',
+         with_bbox=True,
+         with_mask=True,
+         poly2mask=False),
+     dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
+     dict(type='Normalize', **img_norm_cfg),
+     dict(
+         type='ScaleAspectJitter',
+         img_scale=img_scale_train_ctw1500,
+         ratio_range=(0.7, 1.3),
+         aspect_ratio_range=(0.9, 1.1),
+         multiscale_mode='value',
+         keep_ratio=False),
+     # shrink_ratio is from big to small. The 1st must be 1.0
+     dict(type='PANetTargets', shrink_ratio=shrink_ratio_train_ctw1500),
+     dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
+     dict(type='RandomRotateTextDet'),
+     dict(
+         type='RandomCropInstances',
+         target_size=target_size_train_ctw1500,
+         instance_key='gt_kernels'),
+     dict(type='Pad', size_divisor=32),
+     dict(
+         type='CustomFormatBundle',
+         keys=['gt_kernels', 'gt_mask'],
+         visualize=dict(flag=False, boundary_key='gt_kernels')),
+     dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
+ ]
+ 
+ img_scale_test_ctw1500 = (3000, 640)
+ test_pipeline_ctw1500 = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='MultiScaleFlipAug',
+         img_scale=img_scale_test_ctw1500,  # used by Resize
+         flip=False,
+         transforms=[
+             dict(type='Resize', keep_ratio=True),
+             dict(type='Normalize', **img_norm_cfg),
+             dict(type='Pad', size_divisor=32),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img']),
+         ])
+ ]
+ 
+ # for icdar2015
+ img_scale_train_icdar2015 = [(3000, 736)]
+ shrink_ratio_train_icdar2015 = (1.0, 0.5)
+ target_size_train_icdar2015 = (736, 736)
+ train_pipeline_icdar2015 = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='LoadTextAnnotations',
+         with_bbox=True,
+         with_mask=True,
+         poly2mask=False),
+     dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
+     dict(type='Normalize', **img_norm_cfg),
+     dict(
+         type='ScaleAspectJitter',
+         img_scale=img_scale_train_icdar2015,
+         ratio_range=(0.7, 1.3),
+         aspect_ratio_range=(0.9, 1.1),
+         multiscale_mode='value',
+         keep_ratio=False),
+     dict(type='PANetTargets', shrink_ratio=shrink_ratio_train_icdar2015),
+     dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
+     dict(type='RandomRotateTextDet'),
+     dict(
+         type='RandomCropInstances',
+         target_size=target_size_train_icdar2015,
+         instance_key='gt_kernels'),
+     dict(type='Pad', size_divisor=32),
+     dict(
+         type='CustomFormatBundle',
+         keys=['gt_kernels', 'gt_mask'],
+         visualize=dict(flag=False, boundary_key='gt_kernels')),
+     dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
+ ]
+ 
+ img_scale_test_icdar2015 = (1333, 736)
+ test_pipeline_icdar2015 = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='MultiScaleFlipAug',
+         img_scale=img_scale_test_icdar2015,  # used by Resize
+         flip=False,
+         transforms=[
+             dict(type='Resize', keep_ratio=True),
+             dict(type='Normalize', **img_norm_cfg),
+             dict(type='Pad', size_divisor=32),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img']),
+         ])
+ ]
+ 
+ # for icdar2017
+ img_scale_train_icdar2017 = [(3000, 800)]
+ shrink_ratio_train_icdar2017 = (1.0, 0.5)
+ target_size_train_icdar2017 = (800, 800)
+ train_pipeline_icdar2017 = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='LoadTextAnnotations',
+         with_bbox=True,
+         with_mask=True,
+         poly2mask=False),
+     dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
+     dict(type='Normalize', **img_norm_cfg),
+     dict(
+         type='ScaleAspectJitter',
+         img_scale=img_scale_train_icdar2017,
+         ratio_range=(0.7, 1.3),
+         aspect_ratio_range=(0.9, 1.1),
+         multiscale_mode='value',
+         keep_ratio=False),
+     dict(type='PANetTargets', shrink_ratio=shrink_ratio_train_icdar2017),
+     dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
+     dict(type='RandomRotateTextDet'),
+     dict(
+         type='RandomCropInstances',
+         target_size=target_size_train_icdar2017,
+         instance_key='gt_kernels'),
+     dict(type='Pad', size_divisor=32),
+     dict(
+         type='CustomFormatBundle',
+         keys=['gt_kernels', 'gt_mask'],
+         visualize=dict(flag=False, boundary_key='gt_kernels')),
+     dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
+ ]
+ 
+ img_scale_test_icdar2017 = (1333, 800)
+ test_pipeline_icdar2017 = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='MultiScaleFlipAug',
+         img_scale=img_scale_test_icdar2017,  # used by Resize
+         flip=False,
+         transforms=[
+             dict(type='Resize', keep_ratio=True),
+             dict(type='Normalize', **img_norm_cfg),
+             dict(type='Pad', size_divisor=32),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img']),
+         ])
+ ]
configs/_base_/det_pipelines/psenet_pipeline.py ADDED
@@ -0,0 +1,70 @@
+ img_norm_cfg = dict(
+     mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+ 
+ train_pipeline = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='LoadTextAnnotations',
+         with_bbox=True,
+         with_mask=True,
+         poly2mask=False),
+     dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
+     dict(type='Normalize', **img_norm_cfg),
+     dict(
+         type='ScaleAspectJitter',
+         img_scale=[(3000, 736)],
+         ratio_range=(0.5, 3),
+         aspect_ratio_range=(1, 1),
+         multiscale_mode='value',
+         long_size_bound=1280,
+         short_size_bound=640,
+         resize_type='long_short_bound',
+         keep_ratio=False),
+     dict(type='PSENetTargets'),
+     dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
+     dict(type='RandomRotateTextDet'),
+     dict(
+         type='RandomCropInstances',
+         target_size=(640, 640),
+         instance_key='gt_kernels'),
+     dict(type='Pad', size_divisor=32),
+     dict(
+         type='CustomFormatBundle',
+         keys=['gt_kernels', 'gt_mask'],
+         visualize=dict(flag=False, boundary_key='gt_kernels')),
+     dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
+ ]
+ 
+ # for ctw1500
+ img_scale_test_ctw1500 = (1280, 1280)
+ test_pipeline_ctw1500 = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='MultiScaleFlipAug',
+         img_scale=img_scale_test_ctw1500,  # used by Resize
+         flip=False,
+         transforms=[
+             dict(type='Resize', keep_ratio=True),
+             dict(type='Normalize', **img_norm_cfg),
+             dict(type='Pad', size_divisor=32),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img']),
+         ])
+ ]
+ 
+ # for icdar2015
+ img_scale_test_icdar2015 = (2240, 2240)
+ test_pipeline_icdar2015 = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='MultiScaleFlipAug',
+         img_scale=img_scale_test_icdar2015,  # used by Resize
+         flip=False,
+         transforms=[
+             dict(type='Resize', keep_ratio=True),
+             dict(type='Normalize', **img_norm_cfg),
+             dict(type='Pad', size_divisor=32),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img']),
+         ])
+ ]
configs/_base_/det_pipelines/textsnake_pipeline.py ADDED
@@ -0,0 +1,65 @@
+ img_norm_cfg = dict(
+     mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+ train_pipeline = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='LoadTextAnnotations',
+         with_bbox=True,
+         with_mask=True,
+         poly2mask=False),
+     dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
+     dict(type='Normalize', **img_norm_cfg),
+     dict(
+         type='RandomCropPolyInstances',
+         instance_key='gt_masks',
+         crop_ratio=0.65,
+         min_side_ratio=0.3),
+     dict(
+         type='RandomRotatePolyInstances',
+         rotate_ratio=0.5,
+         max_angle=20,
+         pad_with_fixed_color=False),
+     dict(
+         type='ScaleAspectJitter',
+         img_scale=[(3000, 736)],  # unused
+         ratio_range=(0.7, 1.3),
+         aspect_ratio_range=(0.9, 1.1),
+         multiscale_mode='value',
+         long_size_bound=800,
+         short_size_bound=480,
+         resize_type='long_short_bound',
+         keep_ratio=False),
+     dict(type='SquareResizePad', target_size=800, pad_ratio=0.6),
+     dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
+     dict(type='TextSnakeTargets'),
+     dict(type='Pad', size_divisor=32),
+     dict(
+         type='CustomFormatBundle',
+         keys=[
+             'gt_text_mask', 'gt_center_region_mask', 'gt_mask',
+             'gt_radius_map', 'gt_sin_map', 'gt_cos_map'
+         ],
+         visualize=dict(flag=False, boundary_key='gt_text_mask')),
+     dict(
+         type='Collect',
+         keys=[
+             'img', 'gt_text_mask', 'gt_center_region_mask', 'gt_mask',
+             'gt_radius_map', 'gt_sin_map', 'gt_cos_map'
+         ])
+ ]
+
+ test_pipeline = [
+     dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+     dict(
+         type='MultiScaleFlipAug',
+         img_scale=(1333, 736),  # used by Resize
+         flip=False,
+         transforms=[
+             dict(type='Resize', keep_ratio=True),
+             dict(type='Normalize', **img_norm_cfg),
+             dict(type='Pad', size_divisor=32),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img']),
+         ])
+ ]
configs/_base_/recog_datasets/MJ_train.py ADDED
@@ -0,0 +1,21 @@
+ # Text Recognition Training set, including:
+ # Synthetic Datasets: Syn90k
+
+ train_root = 'data/mixture/Syn90k'
+
+ train_img_prefix = f'{train_root}/mnt/ramdisk/max/90kDICT32px'
+ train_ann_file = f'{train_root}/label.lmdb'
+
+ train = dict(
+     type='OCRDataset',
+     img_prefix=train_img_prefix,
+     ann_file=train_ann_file,
+     loader=dict(
+         type='AnnFileLoader',
+         repeat=1,
+         file_format='lmdb',
+         parser=dict(type='LineJsonParser', keys=['filename', 'text'])),
+     pipeline=None,
+     test_mode=False)
+
+ train_list = [train]
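With `file_format='lmdb'` and a `LineJsonParser` keyed on `filename` and `text`, each record in `label.lmdb` is expected to be a JSON object carrying exactly those fields. An illustrative record (the path and transcription below are made up):

    {"filename": "2697/6/466_MONIKER_49537.jpg", "text": "MONIKER"}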
configs/_base_/recog_datasets/ST_MJ_alphanumeric_train.py ADDED
@@ -0,0 +1,31 @@
+ # Text Recognition Training set, including:
+ # Synthetic Datasets: SynthText, Syn90k
+ # Both annotations are filtered so that
+ # only alphanumeric terms are left
+
+ train_root = 'data/mixture'
+
+ train_img_prefix1 = f'{train_root}/Syn90k/mnt/ramdisk/max/90kDICT32px'
+ train_ann_file1 = f'{train_root}/Syn90k/label.lmdb'
+
+ train1 = dict(
+     type='OCRDataset',
+     img_prefix=train_img_prefix1,
+     ann_file=train_ann_file1,
+     loader=dict(
+         type='AnnFileLoader',
+         repeat=1,
+         file_format='lmdb',
+         parser=dict(type='LineJsonParser', keys=['filename', 'text'])),
+     pipeline=None,
+     test_mode=False)
+
+ train_img_prefix2 = f'{train_root}/SynthText/' + \
+     'synthtext/SynthText_patch_horizontal'
+ train_ann_file2 = f'{train_root}/SynthText/alphanumeric_label.lmdb'
+
+ train2 = {key: value for key, value in train1.items()}
+ train2['img_prefix'] = train_img_prefix2
+ train2['ann_file'] = train_ann_file2
+
+ train_list = [train1, train2]
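A small aside on the `train2 = {key: value for key, value in train1.items()}` pattern used here and in the files below: it is a shallow, top-level copy. Overwriting `img_prefix` and `ann_file` afterwards is safe, but the nested `loader` dict stays shared between `train1` and `train2`. If a variant ever needs to tweak a nested field, a deep copy avoids mutating its sibling; a sketch (the `repeat` change is purely illustrative):

    import copy

    train2 = copy.deepcopy(train1)   # nested dicts are now independent
    train2['loader']['repeat'] = 2   # no longer affects train1['loader']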
configs/_base_/recog_datasets/ST_MJ_train.py ADDED
@@ -0,0 +1,29 @@
+ # Text Recognition Training set, including:
+ # Synthetic Datasets: SynthText, Syn90k
+
+ train_root = 'data/mixture'
+
+ train_img_prefix1 = f'{train_root}/Syn90k/mnt/ramdisk/max/90kDICT32px'
+ train_ann_file1 = f'{train_root}/Syn90k/label.lmdb'
+
+ train1 = dict(
+     type='OCRDataset',
+     img_prefix=train_img_prefix1,
+     ann_file=train_ann_file1,
+     loader=dict(
+         type='AnnFileLoader',
+         repeat=1,
+         file_format='lmdb',
+         parser=dict(type='LineJsonParser', keys=['filename', 'text'])),
+     pipeline=None,
+     test_mode=False)
+
+ train_img_prefix2 = f'{train_root}/SynthText/' + \
+     'synthtext/SynthText_patch_horizontal'
+ train_ann_file2 = f'{train_root}/SynthText/label.lmdb'
+
+ train2 = {key: value for key, value in train1.items()}
+ train2['img_prefix'] = train_img_prefix2
+ train2['ann_file'] = train_ann_file2
+
+ train_list = [train1, train2]
configs/_base_/recog_datasets/ST_SA_MJ_real_train.py ADDED
@@ -0,0 +1,81 @@
+ # Text Recognition Training set, including:
+ # Synthetic Datasets: SynthText, SynthAdd, Syn90k
+ # Real Datasets: IC11, IC13, IC15, COCO-Text, IIIT5K
+
+ train_prefix = 'data/mixture'
+
+ train_img_prefix1 = f'{train_prefix}/icdar_2011'
+ train_img_prefix2 = f'{train_prefix}/icdar_2013'
+ train_img_prefix3 = f'{train_prefix}/icdar_2015'
+ train_img_prefix4 = f'{train_prefix}/coco_text'
+ train_img_prefix5 = f'{train_prefix}/IIIT5K'
+ train_img_prefix6 = f'{train_prefix}/SynthText_Add'
+ train_img_prefix7 = f'{train_prefix}/SynthText'
+ train_img_prefix8 = f'{train_prefix}/Syn90k'
+
+ train_ann_file1 = f'{train_prefix}/icdar_2011/train_label.txt'
+ train_ann_file2 = f'{train_prefix}/icdar_2013/train_label.txt'
+ train_ann_file3 = f'{train_prefix}/icdar_2015/train_label.txt'
+ train_ann_file4 = f'{train_prefix}/coco_text/train_label.txt'
+ train_ann_file5 = f'{train_prefix}/IIIT5K/train_label.txt'
+ train_ann_file6 = f'{train_prefix}/SynthText_Add/label.txt'
+ train_ann_file7 = f'{train_prefix}/SynthText/shuffle_labels.txt'
+ train_ann_file8 = f'{train_prefix}/Syn90k/shuffle_labels.txt'
+
+ train1 = dict(
+     type='OCRDataset',
+     img_prefix=train_img_prefix1,
+     ann_file=train_ann_file1,
+     loader=dict(
+         type='AnnFileLoader',
+         repeat=20,
+         file_format='txt',
+         parser=dict(
+             type='LineStrParser',
+             keys=['filename', 'text'],
+             keys_idx=[0, 1],
+             separator=' ')),
+     pipeline=None,
+     test_mode=False)
+
+ train2 = {key: value for key, value in train1.items()}
+ train2['img_prefix'] = train_img_prefix2
+ train2['ann_file'] = train_ann_file2
+
+ train3 = {key: value for key, value in train1.items()}
+ train3['img_prefix'] = train_img_prefix3
+ train3['ann_file'] = train_ann_file3
+
+ train4 = {key: value for key, value in train1.items()}
+ train4['img_prefix'] = train_img_prefix4
+ train4['ann_file'] = train_ann_file4
+
+ train5 = {key: value for key, value in train1.items()}
+ train5['img_prefix'] = train_img_prefix5
+ train5['ann_file'] = train_ann_file5
+
+ train6 = dict(
+     type='OCRDataset',
+     img_prefix=train_img_prefix6,
+     ann_file=train_ann_file6,
+     loader=dict(
+         type='AnnFileLoader',
+         repeat=1,
+         file_format='txt',
+         parser=dict(
+             type='LineStrParser',
+             keys=['filename', 'text'],
+             keys_idx=[0, 1],
+             separator=' ')),
+     pipeline=None,
+     test_mode=False)
+
+ train7 = {key: value for key, value in train6.items()}
+ train7['img_prefix'] = train_img_prefix7
+ train7['ann_file'] = train_ann_file7
+
+ train8 = {key: value for key, value in train6.items()}
+ train8['img_prefix'] = train_img_prefix8
+ train8['ann_file'] = train_ann_file8
+
+ train_list = [train1, train2, train3, train4, train5, train6, train7, train8]
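Since this file wires up eight annotation files across real and synthetic sources (and a stray trailing comma would silently turn a path string into a 1-tuple), a quick pre-flight check can save a failed run. A minimal, illustrative sketch, assuming it is executed from the directory containing `data/`:

    import os.path as osp

    for idx, ds in enumerate(
            [train1, train2, train3, train4, train5, train6, train7, train8], 1):
        # Catch 1-tuples from accidental trailing commas, then missing files.
        assert isinstance(ds['ann_file'], str), f'train{idx}: ann_file must be str'
        assert osp.exists(ds['ann_file']), f'train{idx}: missing {ds["ann_file"]}'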
configs/_base_/recog_datasets/ST_SA_MJ_train.py ADDED
@@ -0,0 +1,48 @@
+ # Text Recognition Training set, including:
+ # Synthetic Datasets: SynthText, SynthAdd, Syn90k
+
+ train_root = 'data/mixture'
+
+ train_img_prefix1 = f'{train_root}/Syn90k/mnt/ramdisk/max/90kDICT32px'
+ train_ann_file1 = f'{train_root}/Syn90k/label.lmdb'
+
+ train1 = dict(
+     type='OCRDataset',
+     img_prefix=train_img_prefix1,
+     ann_file=train_ann_file1,
+     loader=dict(
+         type='AnnFileLoader',
+         repeat=1,
+         file_format='lmdb',
+         parser=dict(type='LineJsonParser', keys=['filename', 'text'])),
+     pipeline=None,
+     test_mode=False)
+
+ train_img_prefix2 = f'{train_root}/SynthText/' + \
+     'synthtext/SynthText_patch_horizontal'
+ train_ann_file2 = f'{train_root}/SynthText/label.lmdb'
+
+ train_img_prefix3 = f'{train_root}/SynthText_Add'
+ train_ann_file3 = f'{train_root}/SynthText_Add/label.txt'
+
+ train2 = {key: value for key, value in train1.items()}
+ train2['img_prefix'] = train_img_prefix2
+ train2['ann_file'] = train_ann_file2
+
+ train3 = dict(
+     type='OCRDataset',
+     img_prefix=train_img_prefix3,
+     ann_file=train_ann_file3,
+     loader=dict(
+         type='AnnFileLoader',
+         repeat=1,
+         file_format='txt',
+         parser=dict(
+             type='LineStrParser',
+             keys=['filename', 'text'],
+             keys_idx=[0, 1],
+             separator=' ')),
+     pipeline=None,
+     test_mode=False)
+
+ train_list = [train1, train2, train3]
configs/_base_/recog_datasets/ST_charbox_train.py ADDED
@@ -0,0 +1,23 @@
+ # Text Recognition Training set, including:
+ # Synthetic Datasets: SynthText (with character level boxes)
+
+ train_img_root = 'data/mixture'
+
+ train_img_prefix = f'{train_img_root}/SynthText'
+
+ train_ann_file = f'{train_img_root}/SynthText/instances_train.txt'
+
+ train = dict(
+     type='OCRSegDataset',
+     img_prefix=train_img_prefix,
+     ann_file=train_ann_file,
+     loader=dict(
+         type='AnnFileLoader',
+         repeat=1,
+         file_format='txt',
+         parser=dict(
+             type='LineJsonParser', keys=['file_name', 'annotations', 'text'])),
+     pipeline=None,
+     test_mode=False)
+
+ train_list = [train]
configs/_base_/recog_datasets/academic_test.py ADDED
@@ -0,0 +1,57 @@
+ # Text Recognition Testing set, including:
+ # Regular Datasets: IIIT5K, SVT, IC13
+ # Irregular Datasets: IC15, SVTP, CT80
+
+ test_root = 'data/mixture'
+
+ test_img_prefix1 = f'{test_root}/IIIT5K/'
+ test_img_prefix2 = f'{test_root}/svt/'
+ test_img_prefix3 = f'{test_root}/icdar_2013/'
+ test_img_prefix4 = f'{test_root}/icdar_2015/'
+ test_img_prefix5 = f'{test_root}/svtp/'
+ test_img_prefix6 = f'{test_root}/ct80/'
+
+ test_ann_file1 = f'{test_root}/IIIT5K/test_label.txt'
+ test_ann_file2 = f'{test_root}/svt/test_label.txt'
+ test_ann_file3 = f'{test_root}/icdar_2013/test_label_1015.txt'
+ test_ann_file4 = f'{test_root}/icdar_2015/test_label.txt'
+ test_ann_file5 = f'{test_root}/svtp/test_label.txt'
+ test_ann_file6 = f'{test_root}/ct80/test_label.txt'
+
+ test1 = dict(
+     type='OCRDataset',
+     img_prefix=test_img_prefix1,
+     ann_file=test_ann_file1,
+     loader=dict(
+         type='AnnFileLoader',
+         repeat=1,
+         file_format='txt',
+         parser=dict(
+             type='LineStrParser',
+             keys=['filename', 'text'],
+             keys_idx=[0, 1],
+             separator=' ')),
+     pipeline=None,
+     test_mode=True)
+
+ test2 = {key: value for key, value in test1.items()}
+ test2['img_prefix'] = test_img_prefix2
+ test2['ann_file'] = test_ann_file2
+
+ test3 = {key: value for key, value in test1.items()}
+ test3['img_prefix'] = test_img_prefix3
+ test3['ann_file'] = test_ann_file3
+
+ test4 = {key: value for key, value in test1.items()}
+ test4['img_prefix'] = test_img_prefix4
+ test4['ann_file'] = test_ann_file4
+
+ test5 = {key: value for key, value in test1.items()}
+ test5['img_prefix'] = test_img_prefix5
+ test5['ann_file'] = test_ann_file5
+
+ test6 = {key: value for key, value in test1.items()}
+ test6['img_prefix'] = test_img_prefix6
+ test6['ann_file'] = test_ann_file6
+
+ test_list = [test1, test2, test3, test4, test5, test6]
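One parsing detail worth noting: `LineStrParser` with `separator=' '` and `keys_idx=[0, 1]` splits each line on spaces and keeps only the first two tokens, so a ground-truth text that itself contains a space would be truncated; such labels need a different separator or a JSON-lines parser. An illustrative annotation line (the filename is invented):

    1002_1.png PRIVATE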
configs/_base_/recog_datasets/seg_toy_data.py ADDED
@@ -0,0 +1,34 @@
+ prefix = 'tests/data/ocr_char_ann_toy_dataset/'
+
+ train = dict(
+     type='OCRSegDataset',
+     img_prefix=f'{prefix}/imgs',
+     ann_file=f'{prefix}/instances_train.txt',
+     loader=dict(
+         type='AnnFileLoader',
+         repeat=100,
+         file_format='txt',
+         parser=dict(
+             type='LineJsonParser', keys=['file_name', 'annotations', 'text'])),
+     pipeline=None,
+     test_mode=True)
+
+ test = dict(
+     type='OCRDataset',
+     img_prefix=f'{prefix}/imgs',
+     ann_file=f'{prefix}/instances_test.txt',
+     loader=dict(
+         type='AnnFileLoader',
+         repeat=1,
+         file_format='txt',
+         parser=dict(
+             type='LineStrParser',
+             keys=['filename', 'text'],
+             keys_idx=[0, 1],
+             separator=' ')),
+     pipeline=None,
+     test_mode=True)
+
+ train_list = [train]
+
+ test_list = [test]
configs/_base_/recog_datasets/toy_data.py ADDED
@@ -0,0 +1,54 @@
+ dataset_type = 'OCRDataset'
+
+ root = 'tests/data/ocr_toy_dataset'
+ img_prefix = f'{root}/imgs'
+ train_anno_file1 = f'{root}/label.txt'
+
+ train1 = dict(
+     type=dataset_type,
+     img_prefix=img_prefix,
+     ann_file=train_anno_file1,
+     loader=dict(
+         type='AnnFileLoader',
+         repeat=100,
+         file_format='txt',
+         file_storage_backend='disk',
+         parser=dict(
+             type='LineStrParser',
+             keys=['filename', 'text'],
+             keys_idx=[0, 1],
+             separator=' ')),
+     pipeline=None,
+     test_mode=False)
+
+ train_anno_file2 = f'{root}/label.lmdb'
+ train2 = dict(
+     type=dataset_type,
+     img_prefix=img_prefix,
+     ann_file=train_anno_file2,
+     loader=dict(
+         type='AnnFileLoader',
+         repeat=100,
+         file_format='lmdb',
+         file_storage_backend='disk',
+         parser=dict(type='LineJsonParser', keys=['filename', 'text'])),
+     pipeline=None,
+     test_mode=False)
+
+ test_anno_file1 = f'{root}/label.lmdb'
+ test = dict(
+     type=dataset_type,
+     img_prefix=img_prefix,
+     ann_file=test_anno_file1,
+     loader=dict(
+         type='AnnFileLoader',
+         repeat=1,
+         file_format='lmdb',
+         file_storage_backend='disk',
+         parser=dict(type='LineJsonParser', keys=['filename', 'text'])),
+     pipeline=None,
+     test_mode=True)
+
+ train_list = [train1, train2]
+
+ test_list = [test]
configs/_base_/recog_models/abinet.py ADDED
@@ -0,0 +1,70 @@
+ # num_chars depends on the configuration of label_convertor. The actual
+ # dictionary size is 36 + 1 (<BOS/EOS>).
+ # TODO: Automatically update num_chars based on the configuration of
+ # label_convertor
+ num_chars = 37
+ max_seq_len = 26
+
+ label_convertor = dict(
+     type='ABIConvertor',
+     dict_type='DICT36',
+     with_unknown=False,
+     with_padding=False,
+     lower=True,
+ )
+
+ model = dict(
+     type='ABINet',
+     backbone=dict(type='ResNetABI'),
+     encoder=dict(
+         type='ABIVisionModel',
+         encoder=dict(
+             type='TransformerEncoder',
+             n_layers=3,
+             n_head=8,
+             d_model=512,
+             d_inner=2048,
+             dropout=0.1,
+             max_len=8 * 32,
+         ),
+         decoder=dict(
+             type='ABIVisionDecoder',
+             in_channels=512,
+             num_channels=64,
+             attn_height=8,
+             attn_width=32,
+             attn_mode='nearest',
+             use_result='feature',
+             num_chars=num_chars,
+             max_seq_len=max_seq_len,
+             init_cfg=dict(type='Xavier', layer='Conv2d')),
+     ),
+     decoder=dict(
+         type='ABILanguageDecoder',
+         d_model=512,
+         n_head=8,
+         d_inner=2048,
+         n_layers=4,
+         dropout=0.1,
+         detach_tokens=True,
+         use_self_attn=False,
+         pad_idx=num_chars - 1,
+         num_chars=num_chars,
+         max_seq_len=max_seq_len,
+         init_cfg=None),
+     fuser=dict(
+         type='ABIFuser',
+         d_model=512,
+         num_chars=num_chars,
+         init_cfg=None,
+         max_seq_len=max_seq_len,
+     ),
+     loss=dict(
+         type='ABILoss',
+         enc_weight=1.0,
+         dec_weight=1.0,
+         fusion_weight=1.0,
+         num_classes=num_chars),
+     label_convertor=label_convertor,
+     max_seq_len=max_seq_len,
+     iter_size=3)
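A model fragment like this only defines the `model` and `label_convertor` dicts, but it can be instantiated directly for a smoke test. A minimal sketch, assuming MMOCR 0.x-style builder APIs:

    from mmcv import Config
    from mmocr.models import build_detector

    # Build the ABINet recognizer from this base file alone.
    cfg = Config.fromfile('configs/_base_/recog_models/abinet.py')
    model = build_detector(cfg.model, train_cfg=None, test_cfg=None)
    model.init_weights()  # random init; load a checkpoint for real inference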
configs/_base_/recog_models/crnn.py ADDED
@@ -0,0 +1,12 @@
+ label_convertor = dict(
+     type='CTCConvertor', dict_type='DICT36', with_unknown=False, lower=True)
+
+ model = dict(
+     type='CRNNNet',
+     preprocessor=None,
+     backbone=dict(type='VeryDeepVgg', leaky_relu=False, input_channels=1),
+     encoder=None,
+     decoder=dict(type='CRNNDecoder', in_channels=512, rnn_flag=True),
+     loss=dict(type='CTCLoss'),
+     label_convertor=label_convertor,
+     pretrained=None)
configs/_base_/recog_models/crnn_tps.py ADDED
@@ -0,0 +1,18 @@
+ # model
+ label_convertor = dict(
+     type='CTCConvertor', dict_type='DICT36', with_unknown=False, lower=True)
+
+ model = dict(
+     type='CRNNNet',
+     preprocessor=dict(
+         type='TPSPreprocessor',
+         num_fiducial=20,
+         img_size=(32, 100),
+         rectified_img_size=(32, 100),
+         num_img_channel=1),
+     backbone=dict(type='VeryDeepVgg', leaky_relu=False, input_channels=1),
+     encoder=None,
+     decoder=dict(type='CRNNDecoder', in_channels=512, rnn_flag=True),
+     loss=dict(type='CTCLoss'),
+     label_convertor=label_convertor,
+     pretrained=None)
configs/_base_/recog_models/master.py ADDED
@@ -0,0 +1,61 @@
+ label_convertor = dict(
+     type='AttnConvertor', dict_type='DICT90', with_unknown=True)
+
+ model = dict(
+     type='MASTER',
+     backbone=dict(
+         type='ResNet',
+         in_channels=3,
+         stem_channels=[64, 128],
+         block_cfgs=dict(
+             type='BasicBlock',
+             plugins=dict(
+                 cfg=dict(
+                     type='GCAModule',
+                     ratio=0.0625,
+                     n_head=1,
+                     pooling_type='att',
+                     is_att_scale=False,
+                     fusion_type='channel_add'),
+                 position='after_conv2')),
+         arch_layers=[1, 2, 5, 3],
+         arch_channels=[256, 256, 512, 512],
+         strides=[1, 1, 1, 1],
+         plugins=[
+             dict(
+                 cfg=dict(type='Maxpool2d', kernel_size=2, stride=(2, 2)),
+                 stages=(True, True, False, False),
+                 position='before_stage'),
+             dict(
+                 cfg=dict(type='Maxpool2d', kernel_size=(2, 1), stride=(2, 1)),
+                 stages=(False, False, True, False),
+                 position='before_stage'),
+             dict(
+                 cfg=dict(
+                     type='ConvModule',
+                     kernel_size=3,
+                     stride=1,
+                     padding=1,
+                     norm_cfg=dict(type='BN'),
+                     act_cfg=dict(type='ReLU')),
+                 stages=(True, True, True, True),
+                 position='after_stage')
+         ],
+         init_cfg=[
+             dict(type='Kaiming', layer='Conv2d'),
+             dict(type='Constant', val=1, layer='BatchNorm2d'),
+         ]),
+     encoder=None,
+     decoder=dict(
+         type='MasterDecoder',
+         d_model=512,
+         n_head=8,
+         attn_drop=0.,
+         ffn_drop=0.,
+         d_inner=2048,
+         n_layers=3,
+         feat_pe_drop=0.2,
+         feat_size=6 * 40),
+     loss=dict(type='TFLoss', reduction='mean'),
+     label_convertor=label_convertor,
+     max_seq_len=30)
configs/_base_/recog_models/nrtr_modality_transform.py ADDED
@@ -0,0 +1,11 @@
+ label_convertor = dict(
+     type='AttnConvertor', dict_type='DICT36', with_unknown=True, lower=True)
+
+ model = dict(
+     type='NRTR',
+     backbone=dict(type='NRTRModalityTransform'),
+     encoder=dict(type='NRTREncoder', n_layers=12),
+     decoder=dict(type='NRTRDecoder'),
+     loss=dict(type='TFLoss'),
+     label_convertor=label_convertor,
+     max_seq_len=40)