diff --git a/scripts/doc_parsing_utils.py b/scripts/doc_parsing_utils.py new file mode 100644 index 0000000000..ff9033660d --- /dev/null +++ b/scripts/doc_parsing_utils.py @@ -0,0 +1,444 @@ +import re +from typing import TypedDict + +CODE_INCLUDE_RE = re.compile(r"^\{\*\s*(\S+)\s*(.*)\*\}$") +CODE_INCLUDE_PLACEHOLDER = "" + +HEADER_WITH_PERMALINK_RE = re.compile(r"^(#{1,6}) (.+?)(\s*\{\s*#.*\s*\})?\s*$") +HEADER_LINE_RE = re.compile(r"^(#{1,6}) (.+?)(?:\s*\{\s*(#.*)\s*\})?\s*$") + +TIANGOLO_COM = "https://fastapi.tiangolo.com" + +MARKDOWN_LINK_RE = re.compile( + r"(?.*?)\]" # link text (non-greedy) + r"\(" + r"(?P[^)\s]+)" # url (no spaces and `)`) + r'(?:\s+["\'](?P.*?)["\'])?' # optional title in "" or '' + r"\)" + r"(?:\s*\{(?P<attrs>[^}]*)\})?" # optional attributes in {} +) + +HTML_LINK_RE = re.compile(r"<a\s+[^>]*>.*?</a>") +HTML_LINK_TEXT = re.compile(r"<a\b([^>]*)>(.*?)</a>") +HTML_LINK_OPEN_TAG_RE = re.compile(r"<a\b([^>]*)>") +HTML_ATTR_RE = re.compile(r'(\w+)\s*=\s*([\'"])(.*?)\2') + + +class CodeIncludeInfo(TypedDict): + line_no: int + line: str + + +class HeaderPermalinkInfo(TypedDict): + line_no: int + hashes: str + permalink: str + + +class MarkdownLinkInfo(TypedDict): + line_no: int + url: str + text: str + title: str | None + attributes: str | None + + +class HTMLLinkAttribute(TypedDict): + name: str + quote: str + value: str + + +class HtmlLinkInfo(TypedDict): + line_no: int + full_tag: str + attributes: list[HTMLLinkAttribute] + text: str + + +# Code includes +# ----------------------------------------------------------------------------------------- + + +def extract_code_includes(lines: list[str]) -> list[CodeIncludeInfo]: + """ + Exctract lines that contain code includes. 
+ + Return list of CodeIncludeInfo namedtuples, where each tuple contains: + - `line_no` - line number (1-based) + - `line` - text of the line + """ + + includes: list[CodeIncludeInfo] = [] + for line_no, line in enumerate(lines, start=1): + if CODE_INCLUDE_RE.match(line): + includes.append(CodeIncludeInfo(line_no=line_no, line=line)) + return includes + + +def replace_code_includes_with_placeholders(text: list[str]) -> list[str]: + """ + Replace code includes with placeholders. + """ + + includes = extract_code_includes(text) + for include in includes: + text[include["line_no"] - 1] = CODE_INCLUDE_PLACEHOLDER + return text + + +def replace_placeholders_with_code_includes( + text: list[str], original_includes: list[CodeIncludeInfo] +) -> list[str]: + """ + Replace code includes placeholders with actual code includes from the original (English) document. + Fail if the number of placeholders does not match the number of original includes. + """ + + modified_text: list[str] = [] + include_index = 0 + for line in text: + if line.strip() == CODE_INCLUDE_PLACEHOLDER: + if include_index >= len(original_includes): + raise ValueError( + "Number of placeholders exceeds number of code includes in the original document" + ) + modified_text.append(original_includes[include_index]["line"]) + include_index += 1 + else: + modified_text.append(line) + + if include_index < len(original_includes): + raise ValueError( + "Number of placeholders is less than number of code includes in the original document" + ) + + return modified_text + + +# Header permalinks +# ----------------------------------------------------------------------------------------- + + +def extract_header_permalinks(lines: list[str]) -> list[HeaderPermalinkInfo]: + """ + Extract list of header permalinks from the given lines. 
+ + Return list of HeaderPermalinkInfo namedtuples, where each tuple contains: + - `line_no` - line number (1-based) + - `hashes` - string of hashes representing header level (e.g., "###") + - `permalink` - permalink string (e.g., "{#permalink}") + """ + + headers: list[HeaderPermalinkInfo] = [] + in_code_block3 = False + in_code_block4 = False + + for line_no, line in enumerate(lines, start=1): + if not (in_code_block3 or in_code_block4): + if line.startswith("```"): + count = len(line) - len(line.lstrip("`")) + if count == 3: + in_code_block3 = True + continue + elif count >= 4: + in_code_block4 = True + continue + + header_match = HEADER_WITH_PERMALINK_RE.match(line) + if header_match: + hashes, _title, permalink = header_match.groups() + headers.append( + HeaderPermalinkInfo( + hashes=hashes, line_no=line_no, permalink=permalink + ) + ) + + elif in_code_block3: + if line.startswith("```"): + count = len(line) - len(line.lstrip("`")) + if count == 3: + in_code_block3 = False + continue + + elif in_code_block4: + if line.startswith("````"): + count = len(line) - len(line.lstrip("`")) + if count >= 4: + in_code_block4 = False + continue + + return headers + + +def remove_header_permalinks(lines: list[str]) -> list[str]: + """ + Remove permalinks from headers in the given lines. + """ + + modified_lines: list[str] = [] + for line in lines: + header_match = HEADER_WITH_PERMALINK_RE.match(line) + if header_match: + hashes, title, _permalink = header_match.groups() + modified_line = f"{hashes} {title}" + modified_lines.append(modified_line) + else: + modified_lines.append(line) + return modified_lines + + +def replace_header_permalinks( + text: list[str], original_permalinks: list[HeaderPermalinkInfo] +) -> list[str]: + """ + Replace permalinks in the given text with the permalinks from the original document. + + Fail if the number or order of headers does not match the original. 
+ """ + + modified_text: list[str] = [] + permalink_index = 0 + for line in text: + header_match = HEADER_LINE_RE.match(line) + if header_match: + if permalink_index >= len(original_permalinks): + raise ValueError( + "Number of headers exceeds number of headers in the original document" + ) + hashes, title, _permalink = header_match.groups() + original_permalink_info = original_permalinks[permalink_index] + if original_permalink_info["hashes"] != hashes: + raise ValueError( + "Header levels do not match between document and original document" + ) + + modified_line = f"{hashes} {title}{original_permalink_info['permalink']}" + modified_text.append(modified_line) + permalink_index += 1 + else: + modified_text.append(line) + + if permalink_index < len(original_permalinks): + raise ValueError( + "Number of headers is less than number of headers in the original document" + ) + + return modified_text + + +# Markdown links +# ----------------------------------------------------------------------------------------- + + +def extract_markdown_links(lines: list[str]) -> list[tuple[str, int]]: + """ + Extract all markdown links from the given lines. + + Return list of MarkdownLinkInfo namedtuples, where each tuple contains: + - `line_no` - line number (1-based) + - `url` - link URL + - `text` - link text + - `title` - link title (if any) + """ + + links: list[MarkdownLinkInfo] = [] + for line_no, line in enumerate(lines, start=1): + for m in MARKDOWN_LINK_RE.finditer(line): + links.append( + MarkdownLinkInfo( + line_no=line_no, + url=m.group("url"), + text=m.group("text"), + title=m.group("title"), + attributes=m.group("attrs"), + ) + ) + return links + + +def _construct_markdown_link( + url: str, text: str, title: str | None, attributes: str | None, lang_code: str +) -> str: + """ + Construct a markdown link, adjusting the URL for the given language code if needed. 
+ """ + + if url.startswith(TIANGOLO_COM): + url = url.replace(TIANGOLO_COM, f"{TIANGOLO_COM}/{lang_code}") + + if title: + link = f'[{text}]({url} "{title}")' + else: + link = f"[{text}]({url})" + + if attributes: + link += f" {{{attributes}}}" + + return link + + +def replace_markdown_links( + text: list[str], original_links: list[MarkdownLinkInfo], lang_code: str +) -> list[str]: + """ + Replace markdown links in the given text with the original links. + + Fail if the number of links does not match the original. + """ + + modified_text: list[str] = [] + link_index = 0 + for line in text: + modified_line = line + for m in MARKDOWN_LINK_RE.finditer(line): + if link_index >= len(original_links): + raise ValueError( + "Number of markdown links exceeds number of markdown links in the original document" + ) + link_text = m.group("text") + assert isinstance(link_text, str) + link_title = m.group("title") + assert link_title is None or isinstance(link_title, str) + + original_link_info = original_links[link_index] + + # Replace + replacement_link = _construct_markdown_link( + url=original_link_info["url"], + text=link_text, + title=link_title, + attributes=original_link_info["attributes"], + lang_code=lang_code, + ) + modified_line = modified_line.replace(m.group(0), replacement_link, 1) + + link_index += 1 + modified_text.append(modified_line) + + if link_index < len(original_links): + raise ValueError( + "Number of markdown links is less than in the original document" + ) + + return modified_text + + +# HTML links +# ----------------------------------------------------------------------------------------- + + +def extract_html_links(lines: list[str]) -> list[HtmlLinkInfo]: + """ + Extract all HTML links from the given lines. 
+ + Return list of HtmlLinkInfo namedtuples, where each tuple contains: + - `line_no` - line number (1-based) + - `full_tag` - full HTML link tag + - `attributes` - list of HTMLLinkAttribute namedtuples (name, quote, value) + - `text` - link text + """ + + links = [] + for line_no, line in enumerate(lines, start=1): + for html_link in HTML_LINK_RE.finditer(line): + link_str = html_link.group(0) + + link_text_match = HTML_LINK_TEXT.match(link_str) + assert link_text_match is not None + link_text = link_text_match.group(2) + assert isinstance(link_text, str) + + link_open_tag_match = HTML_LINK_OPEN_TAG_RE.match(link_str) + assert link_open_tag_match is not None + link_open_tag = link_open_tag_match.group(1) + assert isinstance(link_open_tag, str) + + attributes: list[HTMLLinkAttribute] = [] + for attr_name, attr_quote, attr_value in re.findall( + HTML_ATTR_RE, link_open_tag + ): + assert isinstance(attr_name, str) + assert isinstance(attr_quote, str) + assert isinstance(attr_value, str) + attributes.append( + HTMLLinkAttribute( + name=attr_name, quote=attr_quote, value=attr_value + ) + ) + links.append( + HtmlLinkInfo( + line_no=line_no, + full_tag=link_str, + attributes=attributes, + text=link_text, + ) + ) + return links + + +def _construct_html_link( + link_text: str, + attributes: list[HTMLLinkAttribute], + lang_code: str, +) -> str: + """ + Reconstruct HTML link, adjusting the URL for the given language code if needed. 
+ """ + + attributes_upd: list[HTMLLinkAttribute] = [] + for attribute in attributes: + if attribute["name"] == "href": + original_url = attribute["value"] + if original_url.startswith(TIANGOLO_COM): + url = original_url.replace(TIANGOLO_COM, f"{TIANGOLO_COM}/{lang_code}") + else: + url = original_url + attributes_upd.append( + HTMLLinkAttribute(name="href", quote=attribute["quote"], value=url) + ) + else: + attributes_upd.append(attribute) + + attrs_str = " ".join( + f"{attribute['name']}={attribute['quote']}{attribute['value']}{attribute['quote']}" + for attribute in attributes_upd + ) + return f"<a {attrs_str}>{link_text}</a>" + + +def replace_html_links( + text: list[str], original_links: list[HtmlLinkInfo], lang_code: str +) -> list[str]: + """ + Replace HTML links in the given text with the links from the original document. + + Adjust URLs for the given language code. + Fail if the number of links does not match the original. + """ + + links = extract_html_links(text) + if len(links) > len(original_links): + raise ValueError( + "Number of HTML links exceeds number of HTML links in the original document" + ) + elif len(links) < len(original_links): + raise ValueError("Number of HTML links is less than in the original document") + + modified_text = text.copy() + for link_index, link in enumerate(links): + original_link_info = original_links[link_index] + + # Replace in the document text + replacement_link = _construct_html_link( + link_text=link["text"], + attributes=original_link_info["attributes"], + lang_code=lang_code, + ) + line_no = link["line_no"] - 1 + modified_text[line_no] = modified_text[line_no].replace( + link["full_tag"], replacement_link, 1 + ) + + return modified_text diff --git a/scripts/translation_fixer.py b/scripts/translation_fixer.py new file mode 100644 index 0000000000..c1f036dcc8 --- /dev/null +++ b/scripts/translation_fixer.py @@ -0,0 +1,86 @@ +from pathlib import Path +from typing import Annotated + +import typer + +from 
# ===== scripts/translation_fixer.py =====
"""CLI that re-syncs translated doc pages with their English originals."""

from pathlib import Path
from typing import Annotated

import typer

from scripts.doc_parsing_utils import (
    extract_code_includes,
    extract_header_permalinks,
    extract_html_links,
    extract_markdown_links,
    replace_code_includes_with_placeholders,
    replace_header_permalinks,
    replace_html_links,
    replace_markdown_links,
    replace_placeholders_with_code_includes,
)

cli = typer.Typer()


@cli.callback()
def callback():
    """Empty callback so Typer exposes `fix-pages` as an explicit subcommand."""
    pass


@cli.command()
def fix_pages(
    doc_paths: Annotated[
        list[Path],
        typer.Argument(help="List of paths to documents."),
    ],
):
    """Fix code includes, header permalinks, and links in translated documents.

    Each path is assumed to look like `docs/<lang_code>/<...>` — TODO confirm;
    the matching English source is read from `docs/en/<...>`.
    """
    for path in doc_paths:
        lang_code = path.parts[1]
        if lang_code == "en":
            print(f"Skipping English document: {path}")
            continue

        en_doc_path = Path("docs") / "en" / Path(*path.parts[2:])

        doc_lines = path.read_text(encoding="utf-8").splitlines()
        en_doc_lines = en_doc_path.read_text(encoding="utf-8").splitlines()

        # Fix code includes
        en_code_includes = extract_code_includes(en_doc_lines)
        # Work on a copy: the change-detection comparison below is only
        # meaningful if `doc_lines` itself is never mutated in place.
        doc_lines_with_placeholders = replace_code_includes_with_placeholders(
            doc_lines.copy()
        )
        fixed_doc_lines = replace_placeholders_with_code_includes(
            doc_lines_with_placeholders, en_code_includes
        )
        if fixed_doc_lines != doc_lines:
            print(f"Fixing code includes in: {path}")
            doc_lines = fixed_doc_lines

        # Fix permalinks
        en_permalinks = extract_header_permalinks(en_doc_lines)
        fixed_doc_lines = replace_header_permalinks(doc_lines, en_permalinks)
        if fixed_doc_lines != doc_lines:
            print(f"Fixing header permalinks in: {path}")
            doc_lines = fixed_doc_lines

        # Fix markdown links
        en_markdown_links = extract_markdown_links(en_doc_lines)
        fixed_doc_lines = replace_markdown_links(
            doc_lines, en_markdown_links, lang_code
        )
        if fixed_doc_lines != doc_lines:
            print(f"Fixing markdown links in: {path}")
            doc_lines = fixed_doc_lines

        # Fix HTML links
        en_html_links = extract_html_links(en_doc_lines)
        fixed_doc_lines = replace_html_links(doc_lines, en_html_links, lang_code)
        if fixed_doc_lines != doc_lines:
            print(f"Fixing HTML links in: {path}")
            doc_lines = fixed_doc_lines

        # Fix multiline code blocks
        # TODO: Implement

        # Write back the fixed document
        doc_lines.append("")  # Ensure file ends with a newline
        path.write_text("\n".join(doc_lines), encoding="utf-8")


if __name__ == "__main__":
    cli()