openvino_notebooks / .ci /table_of_content.py
malvika2003's picture
Upload folder using huggingface_hub
db5855f verified
raw
history blame
5.62 kB
import json
import pathlib
import argparse
import re
# Regex for the Markdown heading that opens a table of contents. It is used
# with re.match, so it only has to match the beginning of a line, e.g.
# "#### Table of content:" or "## Table of contents:".
TABLE_OF_CONTENT = r"#+\s+Table of content:?"
def find_tc_in_cell(cell):
    """Locate the table-of-contents heading inside a notebook cell.

    Args:
        cell: Notebook cell mapping whose ``"source"`` entry is iterated
            line by line.

    Returns:
        ``(cell, line_number)`` for the first source line that matches
        ``TABLE_OF_CONTENT``, or ``(None, None)`` when no line matches.
    """
    for line_number, text in enumerate(cell["source"]):
        if re.match(TABLE_OF_CONTENT, text):
            # Only the first heading matters; stop scanning right away.
            return cell, line_number
    return None, None
def create_title_for_tc(title):
    """Turn a Markdown heading line into plain text for a TOC entry.

    Leading ``#`` characters and the whitespace after them are dropped,
    square brackets and newlines are removed, and everything from the first
    ``(http`` up to the last ``)`` is cut out.

    Args:
        title: Heading line, e.g. ``"## [Docs](https://example.com)"``.

    Returns:
        The cleaned heading text.
    """
    heading_text = title.lstrip("#").lstrip()
    without_brackets = re.sub(r"[\[\]\n]", "", heading_text)
    return re.sub(r"\(http.*\)", "", without_brackets)
def create_link_for_tc(title):
    """Build the anchor part of a TOC link from a cleaned title.

    Backticks, ``$`` and ``^`` are removed, then every space becomes ``-``.

    Args:
        title: Title text as produced for the table of contents.

    Returns:
        The anchor string (without the leading ``#``).
    """
    return re.sub(r"[`$^]", "", title).replace(" ", "-")
def remove_old_tc(cell, idx):
    """Strip a previously generated table of contents from a cell.

    Starting at line ``idx``, every TOC list item (``- [text](#anchor)``)
    and every TOC heading line is dropped. Lines before ``idx`` are never
    touched, even when they are textually identical to a removed line.

    Args:
        cell: Notebook cell mapping with a ``"source"`` list, or ``None``.
        idx: Index of the first source line to inspect.

    Returns:
        The same ``cell`` object (its ``"source"`` list is edited in place),
        or ``None`` when ``cell`` is ``None``.
    """
    if cell is None:
        return cell

    source = cell["source"]
    kept_tail = [
        line
        for line in source[idx:]
        if not (re.match(r"\s*-\s*\[.*\]\(#.*\).*", line) or re.match(TABLE_OF_CONTENT, line))
    ]
    # Slice assignment keeps the list object itself (callers may hold a
    # reference to it) and, unlike list.remove(), cannot delete an equal
    # line that lives before ``idx``.
    source[idx:] = kept_tail
    return cell
def get_tc_line(title, title_for_tc, link, tc_list, titles_list):
    """Format one table-of-contents list item for a heading.

    The indentation is derived from the position of the first space in
    ``title`` (i.e. from the number of leading ``#``) and then adjusted so
    that the resulting Markdown list stays well formed relative to the
    previous item.

    Args:
        title: Stripped heading line, e.g. ``"### Section"``.
        title_for_tc: Text to display in the TOC entry.
        link: Anchor (without ``#``) the entry points to.
        tc_list: TOC lines generated so far; the last one supplies the
            previous indentation.
        titles_list: Stripped heading lines generated so far, parallel to
            ``tc_list``.

    Returns:
        The list-item line, terminated by a newline.
    """
    # str.find returns -1 instead of raising, which covers headings written
    # without a space after the '#' characters (e.g. "#Title").
    space_pos = title.find(" ")
    indents_num = (space_pos - 2) * 4 if space_pos >= 0 else -1

    if not tc_list or indents_num < 0:
        # when first list item have more than 1 indents the alignment would be broken
        indents_num = 0
    else:
        prev_indent = tc_list[-1].index("-")
        if indents_num - prev_indent > 4:
            # when previous list item have n indents and current have n+4+1 it broke the alignment
            indents_num = prev_indent + 4
        elif indents_num != prev_indent and space_pos == titles_list[-1].find(" "):
            # when we have several titles with same wrong alignments
            # (find, not index: the previous title may contain no space at all)
            indents_num = prev_indent

    indents = " " * indents_num + "-" + " "
    return f"{indents}[{title_for_tc}](#{link})\n"
def is_ref_to_top_exists(cell, idx):
    """Check whether a "back to top" link directly follows a heading.

    Lines after position ``idx`` are scanned; blank lines are skipped, and
    the scan stops at the first non-blank line.

    Args:
        cell: Sequence of source lines (indexed and sliced like a list).
        idx: Position of the heading line inside ``cell``.

    Returns:
        ``True`` if the first non-blank line after the heading contains the
        back-to-top link, ``False`` otherwise.
    """
    for raw_line in cell[idx + 1 :]:
        text = raw_line.strip()
        if "[back to top ⬆️](#Table-of-content" in text:
            return True
        if text:
            # content of block started
            return False
    return False
def is_markdown(cell):
    """Return ``True`` when the cell's ``"cell_type"`` is ``"markdown"``."""
    cell_type = cell["cell_type"]
    return cell_type == "markdown"
def is_title(line):
    """Tell whether a source line is a non-empty Markdown heading.

    Args:
        line: One line of cell source.

    Returns:
        ``False`` when the stripped line does not start with ``#``;
        otherwise the heading text with the leading ``#`` characters and
        following whitespace removed (an empty, falsy string for a heading
        that has no text).
    """
    stripped = line.strip()
    if not stripped.startswith("#"):
        return False
    return stripped.lstrip("#").lstrip()
def generate_table_of_content(notebook_path: pathlib.Path):
    """Create or refresh the table of contents of one notebook file.

    The notebook JSON is loaded, every heading found in the markdown cells
    after the first cell gets a "back to top" link (unless one is already
    there) and a TOC entry, and the resulting list is written into the cell
    that holds the existing TOC heading, or appended to the first cell when
    no such heading exists. The file is then rewritten in place.

    Args:
        notebook_path: Path of the ``.ipynb`` file to update.

    Returns:
        ``None``. A notebook without cells is left untouched.
    """
    with open(notebook_path, "r", encoding="utf-8") as notebook_file:
        notebook_json = json.load(notebook_file)

    cells = notebook_json["cells"]
    if not cells:
        return

    table_of_content = []
    all_titles = []
    # Anchors already emitted; used to warn about colliding links.
    seen_links = set()

    table_of_content_cell, table_of_content_cell_idx = find_tc_in_cell(cells[0])

    for cell in filter(is_markdown, cells[1:]):
        if table_of_content_cell is None:
            table_of_content_cell, table_of_content_cell_idx = find_tc_in_cell(cell)
            if table_of_content_cell is not None:
                # The cell that hosts the TOC does not contribute headings.
                continue

        source = cell["source"]
        titles = [line for line in source if is_title(line)]
        # Headings are visited in document order, so searching from just
        # past the previous one finds the right line even when the same
        # heading text appears several times in a cell.
        search_from = 0
        for title in titles:
            idx = source.index(title, search_from)
            if not is_ref_to_top_exists(source, idx):
                if not title.endswith("\n"):
                    # Terminate the heading line in place; inserting a
                    # second copy would duplicate the heading on every run.
                    source[idx] = title + "\n"
                source.insert(idx + 1, "[back to top ⬆️](#Table-of-contents:)\n")
                source.insert(idx + 2, "")
            search_from = idx + 1

            title = title.strip()
            title_for_tc = create_title_for_tc(title)
            link_for_tc = create_link_for_tc(title_for_tc)
            new_line = get_tc_line(title, title_for_tc, link_for_tc, table_of_content, all_titles)

            # Compare anchors rather than whole lines: two headings with
            # the same text collide regardless of their nesting level.
            if link_for_tc in seen_links:
                print(
                    f'WARNING: the title "{title_for_tc}" is already used in titles.\n'
                    + "Navigation will work incorrectly, the link will only point to "
                    + "the first encountered title"
                )
            seen_links.add(link_for_tc)

            table_of_content.append(new_line)
            all_titles.append(title)

    table_of_content = ["\n", "#### Table of contents:\n\n"] + table_of_content + ["\n"]

    if table_of_content_cell is not None:
        # Drop the stale entries before appending the regenerated list.
        target_cell = remove_old_tc(table_of_content_cell, table_of_content_cell_idx)
    else:
        target_cell = cells[0]
    target_cell["source"].extend(table_of_content)

    with open(notebook_path, "w", encoding="utf-8") as in_f:
        json.dump(notebook_json, in_f, ensure_ascii=False, indent=1)
if __name__ == "__main__":
    # Command-line entry point: update a single notebook, or every
    # ``*.ipynb`` file found recursively under a directory.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s",
        "--source",
        help=(
            "Please, specify notebook or folder with notebooks. "
            "Table of content will be added or modified in each."
        ),
        required=True,
    )
    args = parser.parse_args()
    path_to_source = pathlib.Path(args.source)

    if not path_to_source.exists():
        print(f"Incorrect path to notebook(s) {path_to_source}")
        # Signal the failure to the caller (e.g. a CI job) with a non-zero
        # status; the bare ``exit()`` helper comes from ``site`` and
        # reported success.
        raise SystemExit(1)

    if path_to_source.is_file():
        generate_table_of_content(path_to_source)
    elif path_to_source.is_dir():
        for notebook in path_to_source.glob("**/*.ipynb"):
            generate_table_of_content(notebook)