"""
Compare "special" blocks of translated documents with the English originals.

For every translated markdown page the script takes the parts that must stay
identical to the English version (code includes, multiline code blocks, header
levels and permalinks, markdown link URLs, HTML link attributes), copies them
from the English page into the translated text and reports whether that
changed anything. In interactive mode it opens a VS Code diff so the problem
can be fixed by hand.

Source recovered from the deletion diff of ``scripts/cmpr.py``.
"""

import os
import platform
import re
import subprocess
from collections.abc import Iterable
from dataclasses import dataclass
from pathlib import Path
from typing import Annotated, Literal, Optional, cast

import typer

ROOT = Path("../")  # assuming this script is in the scripts directory
DOCS_ROOT = os.getenv("DOCS_ROOT", "docs")
TMP_DOCS_PATH = os.getenv("TMP_DOCS_PATH", "non-git/translations")
VSCODE_COMMAND = os.getenv(
    "VSCODE_COMMAND", "code.cmd" if platform.system() == "Windows" else "code"
)

# TBD: `Literal` is not supported in typer 0.16.0, which is the
# version given in the requirements-docs.txt.
# Shall we upgrade that requirement to 0.20.0?
LANGS = Literal["es", "de", "ru", "pt", "uk", "fr", "ja", "ko"]


# Paths (relative to the language docs root) that are skipped by `get_all_paths`.
non_translated_sections = (
    f"reference{os.sep}",
    "release-notes.md",
    "fastapi-people.md",
    "external-links.md",
    "newsletter.md",
    "management-tasks.md",
    "management.md",
    "contributing.md",
)


class Retry(Exception):
    """Raised in interactive mode to ask for the same file to be processed again."""


class CompareError(Exception):
    """Raised when source and target documents cannot be aligned block by block."""


@dataclass
class Config:
    """Options collected by the Typer callback and shared with the commands."""

    lang: LANGS
    interactive: bool = True
    check_code_includes: bool = True
    check_multiline_blocks: bool = True
    check_headers_and_permalinks: bool = True
    check_markdown_links: bool = True
    check_html_links: bool = True
    full_paths: bool = False


def _leading_backticks(text: str) -> int:
    """Return the number of backticks at the very start of *text*."""
    return len(text) - len(text.lstrip("`"))


# ===================================================================================
# Code includes

CODE_INCLUDE_RE = re.compile(r"^\{\*\s*(\S+)\s*(.*)\*\}$")


def extract_code_includes(lines: list[str]) -> list[tuple[int, str]]:
    """
    Find code include lines (``{* path ... *}``).

    Returns a list of ``(line_no, line)`` tuples, with 1-based line numbers.
    """
    return [
        (line_no, line)
        for line_no, line in enumerate(lines, start=1)
        if CODE_INCLUDE_RE.match(line)
    ]


def replace_code_includes(source_text: str, target_text: str) -> str:
    """
    Overwrite each code include line in *target_text* with the one from *source_text*.

    Includes are paired by order of appearance.

    Raises:
        CompareError: if the two texts contain a different number of includes.
    """
    target_lines = target_text.splitlines()
    source_code_includes = extract_code_includes(source_text.splitlines())
    target_code_includes = extract_code_includes(target_lines)

    if len(source_code_includes) != len(target_code_includes):
        raise CompareError(
            f"Number of code includes differs: "
            f"{len(source_code_includes)} in source vs {len(target_code_includes)} in target."
        )

    for (_, src_line), (tgt_line_no, _) in zip(
        source_code_includes, target_code_includes
    ):
        target_lines[tgt_line_no - 1] = src_line

    target_lines.append("")  # To preserve the empty line in the end of the file
    return "\n".join(target_lines)


# ===================================================================================
# Multiline code blocks

# `{3,}` (rather than exactly three backticks) so that the language of a block
# opened with four or more backticks is captured too.
LANG_RE = re.compile(r"^`{3,}([\w-]*)", re.MULTILINE)


def get_code_block_lang(line: str) -> str:
    """Return the language tag of an opening fence line, or ``""`` if there is none."""
    match = LANG_RE.match(line)
    if match:
        return match.group(1)
    return ""


def extract_multiline_blocks(text: str) -> list[tuple[str, int, str]]:
    """
    Collect fenced code blocks from *text*.

    A block opened with exactly three backticks is closed only by a line that
    starts (after leading whitespace) with exactly three backticks; a block
    opened with four or more is closed by any line starting with four or more.
    A block that is never closed is not returned.

    Returns a list of ``(lang, start_line_no, block_text)`` tuples, where
    ``start_line_no`` is 1-based and ``block_text`` includes both fence lines.
    """
    blocks: list[tuple[str, int, str]] = []

    open_fence = 0  # 0: outside a block, 3: inside ``` block, 4: inside ````+ block
    block_lang = ""
    block_start_line = -1
    block_lines: list[str] = []

    for line_no, line in enumerate(text.splitlines(), start=1):
        stripped = line.lstrip()
        fence = _leading_backticks(stripped)

        if open_fence == 0:
            if fence >= 3:
                open_fence = 3 if fence == 3 else 4
                block_lang = get_code_block_lang(stripped)
                block_start_line = line_no
                block_lines = [line]
            continue

        # Inside a block every line belongs to it, the closing fence included.
        block_lines.append(line)
        is_closing = fence == 3 if open_fence == 3 else fence >= 4
        if is_closing:
            blocks.append((block_lang, block_start_line, "\n".join(block_lines)))
            open_fence = 0
            block_lang = ""
            block_start_line = -1

    return blocks


def replace_blocks(source_text: str, target_text: str) -> str:
    """
    Replace every fenced code block in *target_text* with the matching source block.

    Blocks are paired by order of appearance.

    Raises:
        CompareError: if the number of blocks differs, or if a pair of blocks
            has different language tags.
    """
    source_blocks = extract_multiline_blocks(source_text)
    target_blocks = extract_multiline_blocks(target_text)

    if len(source_blocks) != len(target_blocks):
        raise CompareError(
            f"Number of code blocks differs: "
            f"{len(source_blocks)} in source vs {len(target_blocks)} in target."
        )

    for i, ((src_lang, *_), (tgt_lang, tgt_line_no, *_)) in enumerate(
        zip(source_blocks, target_blocks), 1
    ):
        if src_lang != tgt_lang:
            raise CompareError(
                f"Type mismatch in block #{i} (line {tgt_line_no}): "
                f"'{src_lang or '(no lang)'}' vs '{tgt_lang or '(no lang)'}'"
            )

    # Sequentially replace each block in target with the one from source
    result = target_text
    for (*_, src_block), (*_, tgt_block) in zip(source_blocks, target_blocks):
        result = result.replace(tgt_block, src_block, 1)

    return result


# ===================================================================================
# Headers and permalinks

header_with_permalink_pattern = re.compile(r"^(#{1,6}) (.+?)(\s*\{\s*#.*\s*\})?\s*$")


def extract_headers_and_permalinks(
    lines: list[str],
) -> list[tuple[str, int, Optional[str]]]:
    """
    Find markdown headers outside of fenced code blocks.

    Unlike `extract_multiline_blocks`, fences are only recognised when they
    start at the very beginning of the line.

    Returns a list of ``(hashes, line_no, permalink)`` tuples; ``permalink`` is
    the ``{ #... }`` suffix with its leading whitespace, or ``None`` when the
    header has none.
    """
    headers: list[tuple[str, int, Optional[str]]] = []
    open_fence = 0  # 0: outside a block, 3: inside ``` block, 4: inside ````+ block

    for line_no, line in enumerate(lines, start=1):
        fence = _leading_backticks(line)

        if open_fence == 0:
            if fence >= 3:
                open_fence = 3 if fence == 3 else 4
                continue
            header_match = header_with_permalink_pattern.match(line)
            if header_match:
                hashes, _title, permalink = header_match.groups()
                headers.append((hashes, line_no, permalink))
        elif open_fence == 3:
            if fence == 3:
                open_fence = 0
        elif fence >= 4:
            open_fence = 0

    return headers


def replace_headers_and_permalinks(source_text: str, target_text: str) -> str:
    """
    Give every header in *target_text* the permalink of the matching source header.

    Headers are paired by order of appearance; the translated title is kept.
    When the source header has no permalink, an explicit error marker is
    written in its place so the problem is visible in the diff.

    Raises:
        CompareError: if the number of headers differs, or if a pair of
            headers has different levels.
    """
    target_lines = target_text.splitlines()

    source_headers = extract_headers_and_permalinks(source_text.splitlines())
    target_headers = extract_headers_and_permalinks(target_lines)

    if len(source_headers) != len(target_headers):
        raise CompareError(
            f"Number of headers differs: "
            f"{len(source_headers)} in source vs {len(target_headers)} in target."
        )

    for i, ((src_hashes, *_), (tgt_hashes, tgt_line_no, *_)) in enumerate(
        zip(source_headers, target_headers), 1
    ):
        if src_hashes != tgt_hashes:
            raise CompareError(
                f"Header level mismatch in #{i} (line {tgt_line_no}): "
                f"'{src_hashes}' vs '{tgt_hashes}'"
            )

    # Sequentially replace each header permalink in target with the one from source
    for (_, _, src_permalink), (_, tgt_line_no, _) in zip(
        source_headers, target_headers
    ):
        line_index = tgt_line_no - 1  # Convert from 1-based to 0-based
        header_match = header_with_permalink_pattern.match(target_lines[line_index])
        if header_match:
            hashes, title, _ = header_match.groups()
            target_lines[line_index] = (
                f"{hashes} {title}{src_permalink or ' (ERROR - MISSING PERMALINK)'}"
            )

    target_lines.append("")  # To preserve the empty line in the end of the file
    return "\n".join(target_lines)


# ===================================================================================
# Links

# NOTE(review): the recovered source had this pattern garbled (everything
# between the first `<` and the `>` of the `text` group was lost, as were the
# `<url>` and `<title>` group names). The group names follow the surviving
# comments and the `m.group("url")` call below; the two leading lookbehinds are
# a reconstruction - confirm against the original file.
MARKDOWN_LINK_RE = re.compile(
    r"(?<!\\)(?<!\!)"  # not escaped \[...] and not an image ![...]
    r"\[(?P<text>.*?)\]"  # link text (non-greedy)
    r"\("
    r"(?P<url>\S+?)"  # url (no spaces, non-greedy)
    r'(?:\s+["\'](?P<title>.*?)["\'])?'  # optional title in "" or ''
    r"\)"
)


def extract_markdown_links(lines: list[str]) -> list[tuple[str, int]]:
    """Return ``(url, line_no)`` for every markdown link, with 1-based line numbers."""
    return [
        (m.group("url"), line_no)
        for line_no, line in enumerate(lines, start=1)
        for m in MARKDOWN_LINK_RE.finditer(line)
    ]


def replace_markdown_links(source_text: str, target_text: str, lang: str) -> str:
    """
    Replace the URL of every markdown link in *target_text* with the source URL.

    Links are paired by order of appearance. The language code is re-added to
    the URL where the translated link already had it (see
    `add_lang_code_if_needed`).

    Raises:
        CompareError: if the two texts contain a different number of links.
    """
    target_lines = target_text.splitlines()
    source_links = extract_markdown_links(source_text.splitlines())
    target_links = extract_markdown_links(target_lines)

    if len(source_links) != len(target_links):
        raise CompareError(
            f"Number of markdown links differs: "
            f"{len(source_links)} in source vs {len(target_links)} in target."
        )

    # Sequentially replace each link URL in target with the one from source
    for (src_link, _), (tgt_link, tgt_line_no) in zip(source_links, target_links):
        real_line_no = tgt_line_no - 1  # Convert to zero-based
        line = target_lines[real_line_no]
        link_replace = add_lang_code_if_needed(src_link, tgt_link, lang)
        target_lines[real_line_no] = line.replace(tgt_link, link_replace)

    target_lines.append("")  # To preserve the empty line in the end of the file
    return "\n".join(target_lines)


HTML_LINK_RE = re.compile(r"<a\s+[^>]*>.*?</a>")
HTML_LINK_TEXT = re.compile(r"<a\b([^>]*)>(.*?)</a>")
HTML_LINK_OPEN_TAG_RE = re.compile(r"<a\b([^>]*)>")
HTML_ATTR_RE = re.compile(r'(\w+)\s*=\s*([\'"])(.*?)\2')


def extract_html_links(
    lines: list[str],
) -> list[tuple[tuple[str, list[tuple[str, str, str]], str], int]]:
    """
    Find ``<a ...>...</a>`` links that start and end on the same line.

    Returns a list of ``((link_str, attributes, link_text), line_no)`` tuples,
    where ``attributes`` is a list of ``(name, quote_char, value)`` tuples and
    ``line_no`` is 1-based.
    """
    links = []
    for line_no, line in enumerate(lines, start=1):
        for html_link in HTML_LINK_RE.finditer(line):
            link_str = html_link.group(0)
            # Both patterns are looser than HTML_LINK_RE, so they always match here.
            link_text = cast(re.Match, HTML_LINK_TEXT.match(link_str)).group(2)
            link_open_tag = cast(
                re.Match, HTML_LINK_OPEN_TAG_RE.match(link_str)
            ).group(1)
            attributes = HTML_ATTR_RE.findall(link_open_tag)
            links.append(((link_str, attributes, link_text), line_no))
    return links


TIANGOLO_COM = "https://fastapi.tiangolo.com"


def add_lang_code_if_needed(url: str, prev_url: str, lang_code: str) -> str:
    """
    Insert *lang_code* into *url* if *prev_url* pointed at the translated docs.

    Only URLs starting with `TIANGOLO_COM` are affected; everything else is
    returned unchanged.
    """
    if url.startswith(TIANGOLO_COM):
        if prev_url.startswith(f"{TIANGOLO_COM}/{lang_code}"):
            url = url.replace(TIANGOLO_COM, f"{TIANGOLO_COM}/{lang_code}")
    return url


def reconstruct_html_link(
    attributes: list[tuple[str, str, str]],
    link_text: str,
    prev_attributes: list[tuple[str, str, str]],
    lang_code: str,
) -> str:
    """
    Build an ``<a>`` tag from the source *attributes* and the translated *link_text*.

    The ``href`` is adjusted with `add_lang_code_if_needed`, using the ``href``
    found in *prev_attributes* (the attributes of the translated link).
    """
    prev_attributes_dict = {attr[0]: attr[2] for attr in prev_attributes}
    # A translated link without `href` simply has no language prefix to carry
    # over; an empty previous URL makes `add_lang_code_if_needed` a no-op.
    prev_url = prev_attributes_dict.get("href", "")

    attributes_upd = []
    for attr_name, attr_quotes, attr_value in attributes:
        if attr_name == "href":
            attr_value = add_lang_code_if_needed(attr_value, prev_url, lang_code)
        attributes_upd.append((attr_name, attr_quotes, attr_value))

    attrs_str = " ".join(
        f"{name}={quotes}{value}{quotes}" for name, quotes, value in attributes_upd
    )
    return f"<a {attrs_str}>{link_text}</a>"


def replace_html_links(source_text: str, target_text: str, lang: str) -> str:
    """
    Replace the attributes of every HTML link in *target_text* with the source ones.

    Links are paired by order of appearance; the translated link text is kept.

    Raises:
        CompareError: if the two texts contain a different number of links.
    """
    target_lines = target_text.splitlines()
    source_links = extract_html_links(source_text.splitlines())
    target_links = extract_html_links(target_lines)

    if len(source_links) != len(target_links):
        raise CompareError(
            f"Number of HTML links differs: "
            f"{len(source_links)} in source vs {len(target_links)} in target."
        )

    # Sequentially replace attributes of each link URL in target with the one from source
    for (src_link_data, _), (tgt_link_data, tgt_line_no) in zip(
        source_links, target_links
    ):
        real_line_no = tgt_line_no - 1  # Convert to zero-based
        line = target_lines[real_line_no]
        tgt_link_original, tgt_attributes, tgt_link_text = tgt_link_data

        tgt_link_override = reconstruct_html_link(
            src_link_data[1], tgt_link_text, tgt_attributes, lang
        )
        target_lines[real_line_no] = line.replace(tgt_link_original, tgt_link_override)

    target_lines.append("")  # To preserve the empty line in the end of the file
    return "\n".join(target_lines)


# ===================================================================================
# Images


# ===================================================================================
# Helper functions


def get_lang_doc_root_dir(lang: str) -> Path:
    """Return the docs directory of *lang* (``ROOT / DOCS_ROOT / lang / "docs"``)."""
    return ROOT / DOCS_ROOT / lang / "docs"


def iter_all_lang_paths(lang_path_root: Path) -> Iterable[Path]:
    """
    Iterate on the markdown files to translate in order of priority.

    Order: files directly in *lang_path_root*, then everything under the
    priority directories, then all remaining markdown files.
    """

    first_dirs = [
        lang_path_root / "learn",
        lang_path_root / "tutorial",
        lang_path_root / "advanced",
        lang_path_root / "about",
        lang_path_root / "how-to",
    ]
    first_parent = lang_path_root
    yield from first_parent.glob("*.md")
    for dir_path in first_dirs:
        yield from dir_path.rglob("*.md")
    first_dirs_str = tuple(str(d) for d in first_dirs)
    for path in lang_path_root.rglob("*.md"):
        # Skip everything that was already yielded above.
        if str(path).startswith(first_dirs_str):
            continue
        if path.parent == first_parent:
            continue
        yield path


def get_all_paths(lang: str) -> list[str]:
    """
    List the markdown files of *lang* that are subject to comparison.

    Paths are relative to the language docs root; entries starting with one of
    `non_translated_sections` are left out.
    """
    lang_docs_root = get_lang_doc_root_dir(lang)
    relative_paths = (
        str(path.relative_to(lang_docs_root))
        for path in iter_all_lang_paths(lang_docs_root)
    )
    return [
        relpath
        for relpath in relative_paths
        if not relpath.startswith(non_translated_sections)
    ]


# ===================================================================================
# Main


def process_one_file_with_retry(document_path: str, config: Config) -> bool:
    """
    Run `process_one_file` for *document_path* until it stops asking for a retry.

    *document_path* is relative to the docs root of a language.
    Returns the result of the last `process_one_file` call.
    """
    en_docs_root_path = get_lang_doc_root_dir("en")
    lang_docs_root_path = get_lang_doc_root_dir(config.lang)
    while True:
        try:
            return process_one_file(
                en_docs_root_path / document_path,
                lang_docs_root_path / document_path,
                config=config,
            )
        except Retry:  # Retry is only raised in interactive mode
            pass


def _ask_fixed_or_skip(prompt: str) -> str:
    """Keep asking until the user answers F or E (any case); return ``"f"`` or ``"e"``."""
    resp = ""
    while resp not in ("f", "e"):
        resp = input(prompt).strip().lower()
    return resp


def process_one_file(
    en_doc_path_str: Path, lang_doc_path_str: Path, config: Config
) -> bool:
    """
    Compare one translated document with its English original.

    The enabled replacements from *config* are applied to the translated text
    in memory; the translated file itself is never written by this function.

    Returns:
        ``True`` if the document is fine (or, in interactive mode, the user
        marked a non-empty diff as fixed), ``False`` if a file is missing, the
        documents could not be aligned, the diff is non-empty in
        non-interactive mode, or the user chose to skip the document.

    Raises:
        Retry: in interactive mode, after the user reports that an alignment
            error was fixed, so that the file is checked again.
    """
    en_doc_path = Path(en_doc_path_str)
    lang_doc_path = Path(lang_doc_path_str)
    for doc_path in (en_doc_path, lang_doc_path):
        if not doc_path.exists():
            print(f"{'❌🔎 ' if config.interactive else ''}{doc_path} - doesn't exist")
            return False

    en_doc_text = en_doc_path.read_text(encoding="utf-8")
    lang_doc_text = lang_doc_path.read_text(encoding="utf-8")
    lang_doc_text_orig = lang_doc_text

    try:
        if config.check_code_includes:
            lang_doc_text = replace_code_includes(
                source_text=en_doc_text,
                target_text=lang_doc_text,
            )
        if config.check_multiline_blocks:
            lang_doc_text = replace_blocks(
                source_text=en_doc_text,
                target_text=lang_doc_text,
            )
        if config.check_headers_and_permalinks:
            lang_doc_text = replace_headers_and_permalinks(
                source_text=en_doc_text,
                target_text=lang_doc_text,
            )
        if config.check_markdown_links:
            lang_doc_text = replace_markdown_links(
                source_text=en_doc_text,
                target_text=lang_doc_text,
                lang=config.lang,
            )
        if config.check_html_links:
            lang_doc_text = replace_html_links(
                source_text=en_doc_text,
                target_text=lang_doc_text,
                lang=config.lang,
            )

    except CompareError as e:
        print(f"{'❔❌ ' if config.interactive else ''}{lang_doc_path_str} Error: {e}")
        if not config.interactive:
            return False
        subprocess.run([VSCODE_COMMAND, "--diff", lang_doc_path_str, en_doc_path_str])
        resp = _ask_fixed_or_skip(
            " Check the diff, fix the problem, and then type F if it's fixed or E to mark as invalid and skip: "
        )
        if resp == "e":
            print(f"❌ {lang_doc_path_str} skipped with error")
            return False
        print(f"Check {lang_doc_path_str} again")
        raise Retry() from None

    if lang_doc_text_orig != lang_doc_text:
        print(
            f"{'❔🆚 ' if config.interactive else ''}{lang_doc_path_str} - non-empty diff"
        )
        if not config.interactive:
            return False
        # The expected text goes to a scratch file so it can be diffed against
        # the real translation.
        tmp_path = ROOT / TMP_DOCS_PATH / Path(lang_doc_path_str)
        tmp_path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path.write_text(lang_doc_text, encoding="utf-8")
        subprocess.run(
            [VSCODE_COMMAND, "--diff", str(lang_doc_path_str), str(tmp_path)]
        )
        resp = _ask_fixed_or_skip(
            " Check the diff, fix the problem, and then type F to mark it as fixed or E to mark as invalid and skip: "
        )
        if resp == "e":
            print(f"❌ {lang_doc_path_str} skipped with non-empty diff")
            return False

    print(f"{'✅ ' if config.interactive else ''}{lang_doc_path_str} - Ok")
    return True


def _exit_if_errors(pages_with_errors: list[str], lang_docs_root_path: Path) -> None:
    """Print the failed pages and exit with code 1; do nothing if there are none."""
    if not pages_with_errors:
        return
    print("Some documents had errors:")
    docs_path = lang_docs_root_path.relative_to(ROOT)
    for page in pages_with_errors:
        print(f" - {docs_path / page}")
    raise typer.Exit(code=1)


# ===================================================================================
# Typer app

cli = typer.Typer()


# NOTE(review): `main` below is registered with `@cli.callback()` as well.
# Typer appears to keep a single callback per app, in which case this no-op is
# superseded by `main` - confirm and remove.
@cli.callback()
def callback():
    pass


@cli.callback()
def main(
    ctx: typer.Context,
    lang: Annotated[LANGS, typer.Option()],
    interactive: Annotated[
        bool,
        typer.Option(
            help="If True, will open VSCode diffs for each change to fix and confirm.",
        ),
    ] = True,
    full_paths: Annotated[
        bool,
        typer.Option(
            help="If True, the provided document paths are treated as full paths.",
        ),
    ] = False,
    check_code_includes: Annotated[
        bool,
        typer.Option(
            help="If True, will compare code includes blocks.",
        ),
    ] = True,
    check_multiline_blocks: Annotated[
        bool,
        typer.Option(
            help="If True, will compare multiline code blocks.",
        ),
    ] = True,
    check_headers_and_permalinks: Annotated[
        bool,
        typer.Option(
            help="If True, will compare headers and permalinks.",
        ),
    ] = True,
    check_markdown_links: Annotated[
        bool,
        typer.Option(
            help="If True, will compare markdown links.",
        ),
    ] = True,
    check_html_links: Annotated[
        bool,
        typer.Option(
            help="If True, will compare HTML links.",
        ),
    ] = True,
):
    # Store the options on the context so every command can read them.
    ctx.obj = Config(
        lang=lang,
        interactive=interactive,
        full_paths=full_paths,
        check_code_includes=check_code_includes,
        check_multiline_blocks=check_multiline_blocks,
        check_headers_and_permalinks=check_headers_and_permalinks,
        check_markdown_links=check_markdown_links,
        check_html_links=check_html_links,
    )


@cli.command()
def process_all(
    ctx: typer.Context,
):
    """
    Go through all documents of language and compare special blocks with the corresponding
    blocks in English versions of those documents.
    """
    config = cast(Config, ctx.obj)
    lang_docs_root_path = get_lang_doc_root_dir(config.lang)

    pages_with_errors = [
        doc
        for doc in get_all_paths(config.lang)
        if not process_one_file_with_retry(document_path=doc, config=config)
    ]
    _exit_if_errors(pages_with_errors, lang_docs_root_path)


@cli.command()
def process_pages(
    doc_paths: Annotated[
        list[str],
        typer.Argument(
            help="List of relative paths to the EN documents. Should be relative to docs/en/docs/",
        ),
    ],
    ctx: typer.Context,
):
    """
    Compare special blocks of specified EN documents with the corresponding blocks in
    translated versions of those documents.
    """

    config = cast(Config, ctx.obj)
    lang_docs_root_path = get_lang_doc_root_dir(config.lang)

    pages_with_errors: list[str] = []
    for doc_path in doc_paths:
        if config.full_paths:
            # Turn a path given from the repository root into one relative to
            # the language docs root.
            path = ROOT / doc_path.lstrip("/")
            doc_path = str(path.relative_to(lang_docs_root_path))
        if not process_one_file_with_retry(document_path=doc_path, config=config):
            pages_with_errors.append(doc_path)

    _exit_if_errors(pages_with_errors, lang_docs_root_path)


if __name__ == "__main__":
    cli()