mirror of https://github.com/tiangolo/fastapi.git
Handle code blocks, fix some bugs, add `fix-all` command
This commit is contained in:
parent
0339277673
commit
beff498743
|
|
@ -20,10 +20,15 @@ MARKDOWN_LINK_RE = re.compile(
|
||||||
)
|
)
|
||||||
|
|
||||||
HTML_LINK_RE = re.compile(r"<a\s+[^>]*>.*?</a>")
|
HTML_LINK_RE = re.compile(r"<a\s+[^>]*>.*?</a>")
|
||||||
HTML_LINK_TEXT = re.compile(r"<a\b([^>]*)>(.*?)</a>")
|
HTML_LINK_TEXT_RE = re.compile(r"<a\b([^>]*)>(.*?)</a>")
|
||||||
HTML_LINK_OPEN_TAG_RE = re.compile(r"<a\b([^>]*)>")
|
HTML_LINK_OPEN_TAG_RE = re.compile(r"<a\b([^>]*)>")
|
||||||
HTML_ATTR_RE = re.compile(r'(\w+)\s*=\s*([\'"])(.*?)\2')
|
HTML_ATTR_RE = re.compile(r'(\w+)\s*=\s*([\'"])(.*?)\2')
|
||||||
|
|
||||||
|
CODE_BLOCK_LANG_RE = re.compile(r"^```([\w-]*)", re.MULTILINE)
|
||||||
|
|
||||||
|
SLASHES_COMMENT_RE = re.compile(r"^(?P<code>.*?)(?P<comment>\s*// .*)?$")
|
||||||
|
HASH_COMMENT_RE = re.compile(r"^(?P<code>.*?)(?P<comment>\s*# .*)?$")
|
||||||
|
|
||||||
|
|
||||||
class CodeIncludeInfo(TypedDict):
|
class CodeIncludeInfo(TypedDict):
|
||||||
line_no: int
|
line_no: int
|
||||||
|
|
@ -57,6 +62,12 @@ class HtmlLinkInfo(TypedDict):
|
||||||
text: str
|
text: str
|
||||||
|
|
||||||
|
|
||||||
|
class MultilineCodeBlockInfo(TypedDict):
    """A fenced (``` or ````) code block found in a markdown document."""

    # Language tag taken from the opening fence (e.g. "python"); "" when absent.
    lang: str
    # 1-based line number of the opening fence within the source document.
    start_line_no: int
    # Every line of the block, opening and closing fence lines included.
    content: list[str]
|
||||||
|
|
||||||
|
|
||||||
# Code includes
|
# Code includes
|
||||||
# -----------------------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
@ -82,10 +93,11 @@ def replace_code_includes_with_placeholders(text: list[str]) -> list[str]:
|
||||||
Replace code includes with placeholders.
|
Replace code includes with placeholders.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
modified_text = text.copy()
|
||||||
includes = extract_code_includes(text)
|
includes = extract_code_includes(text)
|
||||||
for include in includes:
|
for include in includes:
|
||||||
text[include["line_no"] - 1] = CODE_INCLUDE_PLACEHOLDER
|
modified_text[include["line_no"] - 1] = CODE_INCLUDE_PLACEHOLDER
|
||||||
return text
|
return modified_text
|
||||||
|
|
||||||
|
|
||||||
def replace_placeholders_with_code_includes(
|
def replace_placeholders_with_code_includes(
|
||||||
|
|
@ -274,7 +286,7 @@ def _construct_markdown_link(
|
||||||
link = f"[{text}]({url})"
|
link = f"[{text}]({url})"
|
||||||
|
|
||||||
if attributes:
|
if attributes:
|
||||||
link += f" {{{attributes}}}"
|
link += f"{{{attributes}}}"
|
||||||
|
|
||||||
return link
|
return link
|
||||||
|
|
||||||
|
|
@ -345,7 +357,7 @@ def extract_html_links(lines: list[str]) -> list[HtmlLinkInfo]:
|
||||||
for html_link in HTML_LINK_RE.finditer(line):
|
for html_link in HTML_LINK_RE.finditer(line):
|
||||||
link_str = html_link.group(0)
|
link_str = html_link.group(0)
|
||||||
|
|
||||||
link_text_match = HTML_LINK_TEXT.match(link_str)
|
link_text_match = HTML_LINK_TEXT_RE.match(link_str)
|
||||||
assert link_text_match is not None
|
assert link_text_match is not None
|
||||||
link_text = link_text_match.group(2)
|
link_text = link_text_match.group(2)
|
||||||
assert isinstance(link_text, str)
|
assert isinstance(link_text, str)
|
||||||
|
|
@ -442,3 +454,188 @@ def replace_html_links(
|
||||||
)
|
)
|
||||||
|
|
||||||
return modified_text
|
return modified_text
|
||||||
|
|
||||||
|
|
||||||
|
# Multiline code blocks
|
||||||
|
# -----------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def get_code_block_lang(line: str) -> str:
    """Return the language tag of a fence line like "```python", or "" if none."""
    lang_match = CODE_BLOCK_LANG_RE.match(line)
    return lang_match.group(1) if lang_match else ""
|
||||||
|
|
||||||
|
|
||||||
|
def extract_multiline_code_blocks(text: list[str]) -> list[MultilineCodeBlockInfo]:
    """
    Find all fenced code blocks (``` or ````) in *text* (one string per line).

    3-backtick and 4-backtick fences are tracked separately, so a ````-fenced
    block may contain ``` lines as content. Each closed block is returned as a
    MultilineCodeBlockInfo with its fence lines included in ``content``.

    NOTE(review): a block still open at the end of the input is silently
    dropped — confirm that is intended for malformed documents.
    """
    blocks: list[MultilineCodeBlockInfo] = []

    # State machine: at most one of these flags is True while inside a block.
    in_code_block3 = False
    in_code_block4 = False
    current_block_lang = ""
    current_block_start_line = -1  # 1-based line number of the opening fence
    current_block_lines: list[str] = []

    for line_no, line in enumerate(text, start=1):
        # Fences may be indented; detection works on the stripped line.
        stripped = line.lstrip()

        # --- Detect opening fence ---
        if not (in_code_block3 or in_code_block4):
            if stripped.startswith("```"):
                current_block_start_line = line_no
                # Count of leading backticks decides the fence kind (3 vs 4+).
                count = len(stripped) - len(stripped.lstrip("`"))
                if count == 3:
                    in_code_block3 = True
                    current_block_lang = get_code_block_lang(stripped)
                    current_block_lines = [line]
                    continue
                elif count >= 4:
                    in_code_block4 = True
                    current_block_lang = get_code_block_lang(stripped)
                    current_block_lines = [line]
                    continue

        # --- Detect closing fence ---
        elif in_code_block3:
            if stripped.startswith("```"):
                count = len(stripped) - len(stripped.lstrip("`"))
                # Only an exact 3-backtick fence closes a 3-backtick block;
                # anything else (e.g. 4+ backticks) is kept as content below.
                if count == 3:
                    current_block_lines.append(line)
                    blocks.append(
                        MultilineCodeBlockInfo(
                            lang=current_block_lang,
                            start_line_no=current_block_start_line,
                            content=current_block_lines,
                        )
                    )
                    # Reset the state machine for the next block.
                    in_code_block3 = False
                    current_block_lang = ""
                    current_block_start_line = -1
                    current_block_lines = []
                    continue
            current_block_lines.append(line)

        elif in_code_block4:
            if stripped.startswith("````"):
                count = len(stripped) - len(stripped.lstrip("`"))
                # Any fence of 4+ backticks closes a 4-backtick block.
                if count >= 4:
                    current_block_lines.append(line)
                    blocks.append(
                        MultilineCodeBlockInfo(
                            lang=current_block_lang,
                            start_line_no=current_block_start_line,
                            content=current_block_lines,
                        )
                    )
                    # Reset the state machine for the next block.
                    in_code_block4 = False
                    current_block_lang = ""
                    current_block_start_line = -1
                    current_block_lines = []
                    continue
            current_block_lines.append(line)

    return blocks
|
||||||
|
|
||||||
|
|
||||||
|
def _split_hash_comment(line: str) -> tuple[str, str | None]:
    """Split *line* into (code, trailing "# " comment) — comment is None if absent."""
    hash_match = HASH_COMMENT_RE.match(line)
    if hash_match is None:
        return line.rstrip(), None
    return hash_match.group("code").rstrip(), hash_match.group("comment")
|
||||||
|
|
||||||
|
|
||||||
|
def _split_slashes_comment(line: str) -> tuple[str, str | None]:
    """Split *line* into (code, trailing "// " comment) — comment is None if absent."""
    slashes_match = SLASHES_COMMENT_RE.match(line)
    if slashes_match is None:
        return line, None
    return slashes_match.group("code").rstrip(), slashes_match.group("comment")
|
||||||
|
|
||||||
|
|
||||||
|
def replace_multiline_code_block(
    block_a: MultilineCodeBlockInfo, block_b: MultilineCodeBlockInfo
) -> list[str]:
    """
    Replace multiline code block a with block b leaving comments intact.

    For each line, the code is taken from ``block_b`` while the trailing
    comment — when both lines carry one — is kept from ``block_a``, so that
    e.g. translated comments survive a refresh of the code itself.

    Syntax of comments depends on the language of the code block: ``# `` for
    Python/shell/config-style languages, ``// `` for console/JSON examples.
    Languages without a known comment syntax pass ``block_b`` through
    unchanged; mermaid blocks are returned as ``block_a`` untouched.

    Raises ValueError if the blocks are not compatible (different languages
    or different number of lines).
    """
    if block_a["lang"] != block_b["lang"]:
        raise ValueError("Code blocks have different languages")
    if len(block_a["content"]) != len(block_b["content"]):
        raise ValueError("Code blocks have different number of lines")

    block_language = block_a["lang"].lower()
    if block_language in {"mermaid"}:
        return block_a["content"].copy()  # We don't handle mermaid code blocks for now

    # Pick the comment splitter for this language (None-like fallback below).
    hash_comment_langs = {
        "python",
        "py",
        "sh",
        "bash",
        "dockerfile",
        "requirements",
        "gitignore",
        "toml",
        "yaml",
        "yml",
    }
    slashes_comment_langs = {"console", "json"}
    if block_language in hash_comment_langs:
        split_comment = _split_hash_comment
    elif block_language in slashes_comment_langs:
        split_comment = _split_slashes_comment
    else:
        # Unknown comment syntax: keep block_b's lines as-is.
        return block_b["content"].copy()

    code_block: list[str] = []
    for line_a, line_b in zip(block_a["content"], block_b["content"]):
        _line_a_code, line_a_comment = split_comment(line_a)
        _line_b_code, line_b_comment = split_comment(line_b)
        res_line = line_b
        # Only swap comments when BOTH sides have one: the previous code
        # called str.replace() with line_a_comment=None and crashed with a
        # TypeError whenever block_b had a comment that block_a lacked.
        # Keeping block_b's own comment in that case loses no information.
        if line_b_comment and line_a_comment:
            res_line = res_line.replace(line_b_comment, line_a_comment, 1)
        code_block.append(res_line)

    return code_block
|
||||||
|
|
||||||
|
|
||||||
|
def replace_multiline_code_blocks_in_text(
    text: list[str],
    code_blocks: list[MultilineCodeBlockInfo],
    original_code_blocks: list[MultilineCodeBlockInfo],
) -> list[str]:
    """
    Update each code block in `text` with the corresponding code block from
    `original_code_blocks` with comments taken from `code_blocks`.

    Returns a new list of lines; `text` itself is not mutated.

    Raises ValueError if the number, language, or shape of code blocks do not match.
    """
    if len(code_blocks) != len(original_code_blocks):
        raise ValueError(
            "Number of code blocks does not match the number of original code blocks"
        )

    modified_text = text.copy()
    for block, original_block in zip(code_blocks, original_code_blocks):
        updated_content = replace_multiline_code_block(block, original_block)

        # Overwrite the block's lines in place; start_line_no is 1-based.
        start_line_index = block["start_line_no"] - 1
        for offset, updated_line in enumerate(updated_content):
            modified_text[start_line_index + offset] = updated_line

    # Fixed the return annotation: this function returns the modified text
    # (a list of document lines), not a list of MultilineCodeBlockInfo.
    return modified_text
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,6 @@
|
||||||
|
import difflib
|
||||||
|
import os
|
||||||
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Annotated
|
from typing import Annotated
|
||||||
|
|
||||||
|
|
@ -8,13 +11,27 @@ from scripts.doc_parsing_utils import (
|
||||||
extract_header_permalinks,
|
extract_header_permalinks,
|
||||||
extract_html_links,
|
extract_html_links,
|
||||||
extract_markdown_links,
|
extract_markdown_links,
|
||||||
|
extract_multiline_code_blocks,
|
||||||
replace_code_includes_with_placeholders,
|
replace_code_includes_with_placeholders,
|
||||||
replace_header_permalinks,
|
replace_header_permalinks,
|
||||||
replace_html_links,
|
replace_html_links,
|
||||||
replace_markdown_links,
|
replace_markdown_links,
|
||||||
|
replace_multiline_code_blocks_in_text,
|
||||||
replace_placeholders_with_code_includes,
|
replace_placeholders_with_code_includes,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
non_translated_sections = (
|
||||||
|
f"reference{os.sep}",
|
||||||
|
"release-notes.md",
|
||||||
|
"fastapi-people.md",
|
||||||
|
"external-links.md",
|
||||||
|
"newsletter.md",
|
||||||
|
"management-tasks.md",
|
||||||
|
"management.md",
|
||||||
|
"contributing.md",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
cli = typer.Typer()
|
cli = typer.Typer()
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -23,6 +40,53 @@ def callback():
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def iter_all_lang_paths(lang_path_root: Path) -> Iterable[Path]:
    """
    Iterate on the markdown files to translate in order of priority.

    Yields the top-level ``*.md`` files first, then the priority directories
    (learn, tutorial, advanced, about, how-to) recursively, then every other
    markdown file under ``lang_path_root`` not already yielded.
    """
    first_dirs = [
        lang_path_root / "learn",
        lang_path_root / "tutorial",
        lang_path_root / "advanced",
        lang_path_root / "about",
        lang_path_root / "how-to",
    ]
    first_parent = lang_path_root
    yield from first_parent.glob("*.md")
    for dir_path in first_dirs:
        yield from dir_path.rglob("*.md")
    for path in lang_path_root.rglob("*.md"):
        # Use Path.is_relative_to() rather than string-prefix matching:
        # str(path).startswith(...) would also (wrongly) skip sibling
        # directories whose names merely begin with a priority directory's
        # name, e.g. "learning/" being treated as inside "learn/".
        if any(path.is_relative_to(dir_path) for dir_path in first_dirs):
            continue
        if path.parent == first_parent:
            continue
        yield path
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_paths(lang: str):
    """Return the relative doc paths to process for *lang*, in priority order,
    skipping the non-translated sections."""
    lang_docs_root = Path("docs") / lang / "docs"
    relative_paths = (
        str(path.relative_to(lang_docs_root))
        for path in iter_all_lang_paths(lang_docs_root)
    )
    return [
        relpath
        for relpath in relative_paths
        if not relpath.startswith(non_translated_sections)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
@cli.command()
def fix_all(ctx: typer.Context, language: str):
    # Run fix_pages over every translatable page for the given language,
    # reporting incompatible pages instead of aborting the whole run.
    # (Comments, not a docstring, so the typer --help text stays unchanged.)
    for page in get_all_paths(language):
        doc_path = Path("docs") / language / "docs" / page
        try:
            fix_pages(doc_paths=[doc_path])
        except ValueError as e:
            print(f"Error processing {doc_path}: {e}")
|
||||||
|
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
def fix_pages(
|
def fix_pages(
|
||||||
doc_paths: Annotated[
|
doc_paths: Annotated[
|
||||||
|
|
@ -49,6 +113,11 @@ def fix_pages(
|
||||||
)
|
)
|
||||||
if fixed_doc_lines != doc_lines:
|
if fixed_doc_lines != doc_lines:
|
||||||
print(f"Fixing code includes in: {path}")
|
print(f"Fixing code includes in: {path}")
|
||||||
|
diff = difflib.unified_diff(
|
||||||
|
doc_lines, fixed_doc_lines, fromfile="translation", tofile="fixed"
|
||||||
|
)
|
||||||
|
print("\n".join(diff))
|
||||||
|
|
||||||
doc_lines = fixed_doc_lines
|
doc_lines = fixed_doc_lines
|
||||||
|
|
||||||
# Fix permalinks
|
# Fix permalinks
|
||||||
|
|
@ -75,7 +144,14 @@ def fix_pages(
|
||||||
doc_lines = fixed_doc_lines
|
doc_lines = fixed_doc_lines
|
||||||
|
|
||||||
# Fix multiline code blocks
|
# Fix multiline code blocks
|
||||||
# TODO: Implement
|
en_code_blocks = extract_multiline_code_blocks(en_doc_lines)
|
||||||
|
doc_code_blocks = extract_multiline_code_blocks(doc_lines)
|
||||||
|
fixed_doc_lines = replace_multiline_code_blocks_in_text(
|
||||||
|
doc_lines, doc_code_blocks, en_code_blocks
|
||||||
|
)
|
||||||
|
if fixed_doc_lines != doc_lines:
|
||||||
|
print(f"Fixing multiline code blocks in: {path}")
|
||||||
|
doc_lines = fixed_doc_lines
|
||||||
|
|
||||||
# Write back the fixed document
|
# Write back the fixed document
|
||||||
doc_lines.append("") # Ensure file ends with a newline
|
doc_lines.append("") # Ensure file ends with a newline
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue