|
import re |
|
import streamlit as st |
|
from modelcards import CardData, ModelCard |
|
from markdownTagExtract import tag_checker,listToString,to_markdown |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def file_upload():
    """Return the markdown text the user previously uploaded into
    Streamlit session state (key ``markdown_upload``)."""
    return st.session_state.markdown_upload
|
|
|
|
|
|
|
# Load the uploaded model-card markdown once at import time.
# (The original followed this with a no-op self-assignment
# ``model_card_md = model_card_md``, removed here.)
model_card_md = file_upload()
|
|
|
# Regexes that carve a model-card markdown file into typed spans.
# All patterns are raw strings so sequences like "\s" and "\|" reach the
# regex engine verbatim instead of being treated as (invalid) string
# escapes, which raise SyntaxWarning/DeprecationWarning on modern Python.

# YAML front matter fenced by "---", anchored at the start of the file.
metadata_re = re.compile(r"^---(.*?)---", re.DOTALL)

# Headings, one pattern per level; group 1 is the text after the marker.
header_re = re.compile(r"^\s*# (.*)", re.MULTILINE)
subheader_re = re.compile(r"^\s*## (.*)", re.MULTILINE)
subsubheader_re = re.compile(r"^\s*### (.*)", re.MULTILINE)
subsubsubheader_re = re.compile(r"^\s*#### (.*)", re.MULTILINE)

# "**Key:** value"-style lines: a bold/italic-wrapped key (group 1)
# followed by the rest of the line (group 2).
key_value_re = re.compile(r"^\s*([*_]{2}[^*_]+[*_]{2})([^\n]*)", re.MULTILINE)

# Bulleted list items (-, * or +).
list_item_re = re.compile(r"^\s*[-*+]\s+.*", re.MULTILINE)

# Enumerated list items (lines starting with a digit).
enum_re = re.compile(r"^\s*[0-9].*", re.MULTILINE)

# Markdown table rows (lines starting with "|").
table_re = re.compile(r"^\s*\|.*", re.MULTILINE)

# Plain free-text lines (starting with a letter or an opening paren).
text_item_re = re.compile(r"^\s*[A-Za-z(](.*)", re.MULTILINE)

# Text wrapped in * or _ , possibly spanning a second line.
italicized_text_item_re = re.compile(
    r"^[_*][^_*\s].*\n?.*[^_*][_*]$", flags=re.MULTILINE
)

# Raw HTML/markup tags and markdown images.
tag_re = re.compile(r"^\s*<.*", re.MULTILINE)
image_re = re.compile(r"!\[.*\]\(.*\)", re.MULTILINE)

# Maps each heading level's pattern to the pattern one level deeper.
subheader_re_dict = {
    header_re: subheader_re,
    subheader_re: subsubheader_re,
    subsubheader_re: subsubsubheader_re,
}
|
|
|
|
|
def get_metadata(section_text):
    """Return all YAML front-matter matches (``--- ... ---``) in *section_text*."""
    matches = metadata_re.finditer(section_text)
    return list(matches)
|
|
|
|
|
def find_images(section_text):
    """Return all markdown image matches (``![alt](url)``) in *section_text*."""
    matches = image_re.finditer(section_text)
    return list(matches)
|
|
|
|
|
def find_tags(section_text):
    """Return all raw markup-tag matches (lines starting with ``<``)."""
    matches = tag_re.finditer(section_text)
    return list(matches)
|
|
|
|
|
def find_tables(section_text):
    """Return all markdown table-row matches (lines starting with ``|``)."""
    matches = table_re.finditer(section_text)
    return list(matches)
|
|
|
|
|
def find_enums(section_text):
    """Return all enumerated-list matches (lines starting with a digit)."""
    matches = enum_re.finditer(section_text)
    return list(matches)
|
|
|
|
|
|
|
def find_key_values(section_text):
    """Return all ``**Key:** value``-style line matches in *section_text*."""
    matches = key_value_re.finditer(section_text)
    return list(matches)
|
|
|
|
|
def find_lists(section_text):
    """Return all bulleted list-item matches (``-``, ``*`` or ``+``)."""
    matches = list_item_re.finditer(section_text)
    return list(matches)
|
|
|
|
|
def find_texts(section_text):
    """Return free-text matches: plain text lines first, then italicized
    text blocks, concatenated in that order."""
    plain = list(text_item_re.finditer(section_text))
    italic = list(italicized_text_item_re.finditer(section_text))
    return plain + italic
|
|
|
|
|
def find_headers(full_text):
    """Return a 4-tuple of match lists, one per heading level
    (``#``, ``##``, ``###``, ``####``), shallowest first."""
    level_patterns = (header_re, subheader_re, subsubheader_re, subsubsubheader_re)
    return tuple(list(pattern.finditer(full_text)) for pattern in level_patterns)
|
|
|
|
|
# ---- Module-level parsing pass over the uploaded model card ----

# Strip the YAML front matter (if any) so the span regexes only see the
# markdown body.
metadata_list = get_metadata(model_card_md)
if metadata_list != []:
    # End offset of the last front-matter match; everything up to it is
    # dropped from the working text.
    metadata_end = metadata_list[-1].span()[-1]
    print("Metadata extracted")

    model_card_md = model_card_md[metadata_end:]
else:
    print("No metadata found")

# Heading matches, one list per level (see find_headers).
headers_list = find_headers(model_card_md)
print("Headers extracted")

headers = headers_list[0]

subheaders = headers_list[1]

subsubheaders = headers_list[2]

subsubsubheaders = headers_list[3]

# Match lists for every other span type the card may contain.
lists_list = find_lists(model_card_md)
print("Bulleted lists extracted")

enums_list = find_enums(model_card_md)
print("Enumerated lists extracted")

key_value_list = find_key_values(model_card_md)
print("Key values extracted")

tables_list = find_tables(model_card_md)
print("Tables extracted")

tags_list = find_tags(model_card_md)
print("Markup tags extracted")

images_list = find_images(model_card_md)
print("Images extracted")

texts_list = find_texts(model_card_md)
print("Free text extracted")
|
|
|
|
|
|
|
|
|
|
|
# Human-readable labels for each span type. They double as the
# ``match_type`` tag stored with every span and (mostly) as keys into
# ``all_spans_dict`` below.
LIST_ITEM = "List item"
KEY_VALUE = "Key: Value"
FREE_TEXT = "Free text"
ENUM_LIST_ITEM = "Enum item"
TABLE_ITEM = "Table item"
TAG_ITEM = "Markup tag"
IMAGE_ITEM = "Image"
|
|
|
|
|
def create_span_dict(match_list, match_type):
    """Build ``{(start, end): (matched_text, match_type)}`` from regex matches.

    Whitespace-only and empty matches are skipped. Knowing which span is of
    which type drives the app's form-filling, and the span inventory lets us
    detect parts of the .md file no pattern covered.
    """
    return {
        match.span(): (match.group(), match_type)
        for match in match_list
        if match.group().strip()
    }
|
|
|
|
|
# One span dictionary per span type:
# {(start, end): (matched_text, type_label)}.
metadata_span_dict = create_span_dict(metadata_list, "Metadata")

header_span_dict = create_span_dict(headers, "# Header")
subheader_span_dict = create_span_dict(subheaders, "## Subheader")
subsubheader_span_dict = create_span_dict(subsubheaders, "### Subsubheader")
subsubsubheader_span_dict = create_span_dict(subsubsubheaders, "#### Subsubsubheader")
key_value_span_dict = create_span_dict(key_value_list, KEY_VALUE)
lists_span_dict = create_span_dict(lists_list, LIST_ITEM)
enums_span_dict = create_span_dict(enums_list, ENUM_LIST_ITEM)
tables_span_dict = create_span_dict(tables_list, TABLE_ITEM)
tags_span_dict = create_span_dict(tags_list, TAG_ITEM)
images_span_dict = create_span_dict(images_list, IMAGE_ITEM)
texts_span_dict = create_span_dict(texts_list, FREE_TEXT)
|
|
|
|
|
|
|
# Collect every span dictionary under a per-type key, ready to be merged
# and sorted by get_sorted_spans.
# NOTE(review): metadata_span_dict is not included here — presumably
# because the metadata block was already sliced off model_card_md above;
# confirm that is intentional.
all_spans_dict = {}
all_spans_dict["headers"] = header_span_dict
all_spans_dict["subheaders"] = subheader_span_dict
all_spans_dict["subsubheaders"] = subsubheader_span_dict
all_spans_dict["subsubsubheaders"] = subsubsubheader_span_dict
all_spans_dict[LIST_ITEM] = lists_span_dict
all_spans_dict[KEY_VALUE] = key_value_span_dict
all_spans_dict[TABLE_ITEM] = tables_span_dict
all_spans_dict[ENUM_LIST_ITEM] = enums_span_dict
all_spans_dict[TAG_ITEM] = tags_span_dict
all_spans_dict[IMAGE_ITEM] = images_span_dict
all_spans_dict[FREE_TEXT] = texts_span_dict
|
|
|
|
|
def get_sorted_spans(spans_dict):
    """Merge the per-type span dicts and return their spans in file order.

    Args:
        spans_dict: mapping of type label -> {(start, end): (text, type)}.

    Returns:
        ``(sorted_spans, merged_spans)`` where ``sorted_spans`` is the list
        of all (start, end) keys sorted ascending and ``merged_spans`` is
        the single combined span dict. Later sub-dicts win on key clashes,
        matching ``dict.update`` semantics.
    """
    merged = {
        span: info
        for sub_dict in spans_dict.values()
        for span, info in sub_dict.items()
    }
    return sorted(merged), merged
|
|
|
|
|
# Flatten all span dicts and get the spans in file order; both names are
# used by extract_headers/stringify below.
sorted_spans, merged_spans = get_sorted_spans(all_spans_dict)

# Report any leading text before the first recognized span.
# NOTE(review): this indexes sorted_spans[0] unguarded — raises IndexError
# if no spans were found at all; confirm empty input cannot occur here.
if sorted_spans[0][0] != 0:
    print("FYI, our spans don't start at the start of the file.")
    print("We did not catch this start:")
    print(model_card_md[: sorted_spans[0][0]])

# Walk consecutive span pairs and report any non-whitespace text that fell
# between them, i.e. content no pattern matched.
for idx in range(len(sorted_spans) - 1):
    last_span_end = sorted_spans[idx][1]
    new_span_start = sorted_spans[idx + 1][0]
    if new_span_start > last_span_end + 1:
        start_nonparse = sorted_spans[idx]
        end_nonparse = sorted_spans[idx + 1]
        text = model_card_md[start_nonparse[1] : end_nonparse[0]]
        if text.strip():
            print("Found an unparsed span in the file:")
            print(start_nonparse)
            print(" ---> ")
            print(end_nonparse)
            print(text)
|
|
|
|
|
def section_map_to_help_text(text_retrieved):
    """Return the help/placeholder text for a given section heading.

    Args:
        text_retrieved: the heading string, e.g. ``"## Uses"``.

    Returns:
        The help string for that heading, or ``None`` when the heading is
        not recognized (matching the original's implicit-None behavior).
    """
    presit_states = {
        "## Model Details": "Give an overview of your model, the relevant research paper, who trained it, etc.",
        "## How to Get Started with the Model": "Give an overview of how to get started with the model",
        "## Limitations and Biases": "Provide an overview of the possible Limitations and Risks that may be associated with this model",
        "## Uses": "Detail the potential uses, intended use and out-of-scope uses for this model",
        "## Training": "Provide an overview of the Training Data and Training Procedure for this model",
        "## Evaluation Results": "Detail the Evaluation Results for this model",
        "## Environmental Impact": "Provide an estimate for the carbon emissions: Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here.",
        "## Citation Information": "How to best cite the model authors",
        "## Glossary": "If relevant, include terms and calculations in this section that can help readers understand the model or model card.",
        "## More Information": "Any additional information",
        "## Model Card Authors": "This section provides another layer of transparency and accountability. Whose views is this model card representing? How many voices were included in its construction? Etc.",
        # NOTE(review): this key lacks the "## " prefix the others have —
        # presumably a typo, but kept as-is to preserve behavior; confirm.
        "Model Card Contact": "Mediums to use, in order to contact the model creators",
        "## Technical Specifications": " Additional technical information",
        "## Model Examination": " Examining the model",
    }

    # Bug fix: the original did ``presit_states(key)`` — calling the dict —
    # which raised TypeError on every successful match. A plain lookup is
    # what was intended; .get returns None on a miss, as before.
    return presit_states.get(text_retrieved)
|
|
|
|
|
def section_map_to_persist(text_retrieved):
    """Map a section heading back to its session-state key name.

    Args:
        text_retrieved: the heading string, e.g. ``"## Uses"``.

    Returns:
        The matching state key (e.g. ``"Model_uses"``), or ``None`` when no
        heading matches.
    """
    heading_for_key = {
        "Model_details_text": "## Model Details",
        "Model_how_to": "## How to Get Started with the Model",
        "Model_Limits_n_Risks": "## Limitations and Biases",
        "Model_uses": "## Uses",
        "Model_training": "## Training",
        "Model_Eval": "## Evaluation Results",
        "Model_carbon": "## Environmental Impact",
        "Model_cite": "## Citation Information",
        "Glossary": "## Glossary",
        "More_info": "## More Information",
        "Model_card_authors": "## Model Card Authors",
        "Model_card_contact": "## Model Card Contact",
        "Technical_specs": "## Technical specifications",
        "Model_examin": "## Model Examination",
    }

    # Reverse lookup: return the first state key whose heading matches.
    for state_key, heading in heading_for_key.items():
        if heading == text_retrieved:
            return state_key
|
|
|
|
|
def main():
    """Demo entry point: extract and print the 'Model Details' section of
    the parsed model card."""
    print(extract_it("Model_details_text"))
|
|
|
|
|
def extract_headers(spans=None, span_map=None):
    """Group heading spans by level and link each to its predecessor.

    For every heading span, records ``(own_index, previous_index)`` where
    ``own_index`` is the span's position in the sorted span list and
    ``previous_index`` is the position of the previous heading of the same
    level (``None`` for the first one). ``stringify`` uses these index
    pairs to slice out the content between consecutive same-level headings.

    Args:
        spans: ordered list of ``(start, end)`` spans; defaults to the
            module-level ``sorted_spans`` (backward compatible — existing
            callers pass no arguments).
        span_map: mapping ``span -> (text, type_label)``; defaults to the
            module-level ``merged_spans``.

    Returns:
        ``(headers, subheaders, subsubheaders, subsubsubheaders)`` — four
        dicts mapping a span to ``(own_index, previous_index)``.
    """
    if spans is None:
        spans = sorted_spans
    if span_map is None:
        span_map = merged_spans

    level_labels = ("# Header", "## Subheader", "### Subsubheader",
                    "#### Subsubsubheader")
    buckets = ({}, {}, {}, {})
    # Last-seen index per level; None until a heading of that level occurs.
    previous = [None, None, None, None]

    # enumerate replaces the original's repeated O(n) list.index() calls;
    # spans come from dict keys, so they are unique and the index agrees.
    for idx, span in enumerate(spans):
        span_type = span_map[span][1]
        if span_type in level_labels:
            level = level_labels.index(span_type)
            buckets[level][span] = (idx, previous[level])
            previous[level] = idx

    return buckets
|
|
|
|
|
def stringify():
    """Build, per heading level, a dict of cleaned heading name -> joined
    text of the spans between consecutive same-level headings.

    Returns:
        ``(headers_strings, subheaders_strings, subsubheaders_strings,
        subsubsubheaders_strings)``.

    NOTE(review): the logic depends on the exact iteration order of
    ``extract_headers`` output and on the ``first`` sentinel below; the
    code is left byte-identical on purpose.
    """
    headers, subheaders, subsubheaders, subsubsubheaders = extract_headers()
    headers_strings = {}
    subheaders_strings = {}
    subsubheaders_strings = {}
    subsubsubheaders_strings = {}

    first = None
    for i in headers:
        # headers[i] is (own index in sorted_spans, index of previous
        # same-level heading); the very first heading has no predecessor,
        # so there is no section body to collect yet.
        if headers[i][1] == None:
            continue
        # Spans lying between the previous heading and this one.
        sub_spans = sorted_spans[headers[i][1] : headers[i][0]]
        lines = []
        for x in sub_spans:
            lines.append(merged_spans[x][0])
        try:
            # The first collected span's text is used as the section name.
            name = lines[0]
        except:
            # No spans collected — fall back to a default section name.
            name = "Model Details"
        lines = "".join(lines)

        # Key by the heading text with markdown/template noise stripped.
        headers_strings[
            name.replace("\n# ", "")
            .replace(" ", "")
            .replace(" ", "")
            .replace("\n", "")
            .replace("{{", "")
            .replace("}}", "")
        ] = lines
        first = i

    first = None
    for i in subheaders:
        if subheaders[i][1] == None:
            continue
        sub_spans = sorted_spans[subheaders[i][1] : subheaders[i][0]]
        lines = []
        for x in sub_spans:
            # Stop at the next subheader (only before any subheader has
            # been processed) or at any top-level header.
            if merged_spans[x][1] == "## Subheader" and first == None:
                break
            elif merged_spans[x][1] == "# Header":
                break
            else:
                lines.append(merged_spans[x][0])
        try:
            name = lines[0]
        except:
            name = "Model Details"
        lines = "".join(lines)

        subheaders_strings[
            name.replace("\n# ", "").replace(" ", "").replace(" ", "")
        ] = lines
        first = i

    first = None
    for i in subsubheaders:
        if subsubheaders[i][1] == None:
            continue
        sub_spans = sorted_spans[subsubheaders[i][1] : subsubheaders[i][0]]
        lines = []
        for x in sub_spans:
            # Stop at an enclosing subheader, or at another subsubheader
            # before any has been processed.
            if merged_spans[x][1] == "## Subheader" or (
                merged_spans[x][1] == "### Subsubheader" and first == None
            ):
                break
            else:
                lines.append(merged_spans[x][0])
        lines = "".join(lines)

        # Here the key is derived from the heading span itself rather than
        # from the first collected line (unlike the two loops above).
        subsubheaders_strings[
            merged_spans[i][0].replace("\n", "").replace("### ", "").replace(" ", "")
        ] = lines
        first = i

    for i in subsubsubheaders:
        if subsubsubheaders[i][1] == None:
            continue
        sub_spans = sorted_spans[subsubsubheaders[i][1] : subsubsubheaders[i][0]]
        lines = []
        for x in sub_spans:
            # Stop at any enclosing sub- or subsub-header.
            if (
                merged_spans[x][1] == "## Subheader"
                or merged_spans[x][1] == "### Subsubheader"
            ):
                break
            else:
                lines.append(merged_spans[x][0])
        lines = "".join(lines)

        subsubsubheaders_strings[
            merged_spans[i][0].replace("#### ", "").replace("**", "").replace("\n", "")
        ] = lines

    return (
        headers_strings,
        subheaders_strings,
        subsubheaders_strings,
        subsubsubheaders_strings,
    )
|
|
|
|
|
def extract_it(text_to_retrieve):
    """Collect the card text for one logical section and return it.

    Matches each keyword in ``needed`` against the cleaned heading names at
    every level and concatenates the corresponding section bodies, then
    returns the accumulated text for the session-state key
    *text_to_retrieve* (e.g. ``"Model_details_text"``).

    Raises:
        KeyError: if *text_to_retrieve* is not one of the known state keys.
    """
    print("Span\t\tType\t\tText")
    print("-------------------------------------")
    # NOTE(review): these four locals are never read in this function —
    # likely leftovers from an earlier version.
    found_subheader = False
    current_subheader = " "
    page_state = " "
    help_text = " "

    (
        headers_strings,
        subheaders_strings,
        subsubheaders_strings,
        subsubsubheaders_strings,
    ) = stringify()

    # Cleaned heading names per level (dict keys produced by stringify).
    h_keys = list(headers_strings.keys())
    sh_keys = list(subheaders_strings.keys())
    ssh_keys = list(subsubheaders_strings.keys())
    sssh_keys = list(subsubsubheaders_strings.keys())

    # Lower-case keywords used for substring matching against headings.
    needed = [
        "model details",
        "howto",
        "limitations",
        "uses",
        "training",
        "evaluation",
        "environmental",
        "citation",
        "glossary",
        "more information",
        "authors",
        "contact",
    ]

    # Accumulator: one string of concatenated section text per keyword.
    info_strings = {
        "model details": "",
        "howto": "",
        "limitations": "",
        "uses": "",
        "training": "",
        "evaluation": "",
        "environmental": "",
        "citation": "",
        "glossary": "",
        "more information": "",
        "authors": "",
        "contact": "",
    }

    # For each keyword, append every section whose heading contains it,
    # scanning all four heading levels.
    for x in needed:
        for l in h_keys:
            if x in l.lower():
                info_strings[x] = info_strings[x] + headers_strings[l]
        for i in sh_keys:
            if x in i.lower():
                info_strings[x] = info_strings[x] + subheaders_strings[i]
        for z in ssh_keys:
            # NOTE(review): the bare except presumably guards against
            # non-string keys; it silently skips any error — confirm.
            try:
                if x in z.lower():
                    info_strings[x] = info_strings[x] + subsubheaders_strings[z]
            except:
                continue
        for y in sssh_keys:
            try:
                if x in y.lower():
                    info_strings[x] = info_strings[x] + subsubsubheaders_strings[y]
            except:
                continue

    # Map session-state keys to the accumulated text. The last two entries
    # are hard-coded heading placeholders rather than extracted text.
    extracted_info = {
        "Model_details_text": info_strings["model details"],
        "Model_how_to": info_strings["howto"],
        "Model_Limits_n_Risks": info_strings["limitations"],
        "Model_uses": info_strings["uses"],
        "Model_training": info_strings["training"],
        "Model_Eval": info_strings["evaluation"],
        "Model_carbon": info_strings["environmental"],
        "Model_cite": info_strings["citation"],
        "Glossary": info_strings["glossary"],
        "More_info": info_strings["more information"],
        "Model_card_authors": info_strings["authors"],
        "Model_card_contact": info_strings["contact"],
        "Technical_specs": "## Technical specifications",
        "Model_examin": "## Model Examination",
    }

    # Trailing space keeps downstream text widgets from showing empty.
    new_t = extracted_info[text_to_retrieve] + " "

    return(new_t)
|
|
|
|
|
# Script entry point: only run the demo extraction when executed directly,
# not when imported as a module.
if __name__ == "__main__":

    main()
|
|