diff --git a/scripts/cmpr.py b/scripts/cmpr.py
new file mode 100644
index 000000000..31fc517d8
--- /dev/null
+++ b/scripts/cmpr.py
@@ -0,0 +1,690 @@
+import os
+import platform
+import re
+import subprocess
+from collections.abc import Iterable
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Annotated, Literal, cast
+
+import typer
+
+ROOT = Path("../")  # assuming this script is in the scripts directory
+DOCS_ROOT = os.getenv("DOCS_ROOT", "docs")
+TMP_DOCS_PATH = os.getenv("TMP_DOCS_PATH", "non-git/translations")
+VSCODE_COMMAND = os.getenv(
+    "VSCODE_COMMAND", "code.cmd" if platform.system() == "Windows" else "code"
+)
+
+# TBD: `Literal` is not supported in typer 0.16.0, which is the
+# version given in requirements-docs.txt.
+# Shall we upgrade that requirement to 0.20.0?
+LANGS = Literal["es", "de", "ru", "pt", "uk", "fr"]
+
+
+non_translated_sections = (
+    f"reference{os.sep}",
+    "release-notes.md",
+    "fastapi-people.md",
+    "external-links.md",
+    "newsletter.md",
+    "management-tasks.md",
+    "management.md",
+    "contributing.md",
+)
+
+
+class Retry(Exception):
+    pass
+
+
+class CompareError(Exception):
+    pass
+
+
+@dataclass
+class Config:
+    lang: LANGS
+    interactive: bool = True
+    check_code_includes: bool = True
+    check_multiline_blocks: bool = True
+    check_headers_and_permalinks: bool = True
+    check_markdown_links: bool = True
+    check_html_links: bool = True
+    full_paths: bool = False
+
+
+# ===================================================================================
+# Code includes
+
+CODE_INCLUDE_RE = re.compile(r"^\{\*\s*(\S+)\s*(.*)\*\}$")
+
+
+def extract_code_includes(lines: list[str]) -> list[tuple[int, str]]:
+    includes = []
+    for line_no, line in enumerate(lines, start=1):
+        if CODE_INCLUDE_RE.match(line):
+            includes.append((line_no, line))
+    return includes
+
+
+def replace_code_includes(source_text: str, target_text: str) -> str:
+    target_lines = target_text.splitlines()
+    source_code_includes = extract_code_includes(source_text.splitlines())
+    target_code_includes = extract_code_includes(target_lines)
+
+    if len(source_code_includes) != len(target_code_includes):
+        raise CompareError(
+            f"Number of code includes differs: "
+            f"{len(source_code_includes)} in source vs {len(target_code_includes)} in target."
+        )
+
+    for src_include, tgt_include in zip(source_code_includes, target_code_includes):
+        _, src_line = src_include
+        tgt_line_no, _ = tgt_include
+        target_lines[tgt_line_no - 1] = src_line
+
+    target_lines.append("")  # To preserve the empty line at the end of the file
+    return "\n".join(target_lines)
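+
+
+# A hypothetical illustration (not from the real docs): an include line such as
+#     {* ../../docs_src/first_steps/tutorial001.py hl[4] *}
+# is matched by CODE_INCLUDE_RE, and replace_code_includes() copies the English
+# line verbatim over the corresponding line of the translation, e.g.:
+#     fixed_text = replace_code_includes(source_text=en_text, target_text=ru_text)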
+
+
+# ===================================================================================
+# Multiline code blocks
+
+LANG_RE = re.compile(r"^```([\w-]*)", re.MULTILINE)
+
+
+def get_code_block_lang(line: str) -> str:
+    match = LANG_RE.match(line)
+    if match:
+        return match.group(1)
+    return ""
+
+
+def extract_multiline_blocks(text: str) -> list[tuple[str, int, str]]:
+    lines = text.splitlines()
+    blocks = []
+
+    in_code_block3 = False
+    in_code_block4 = False
+    current_block_lang = ""
+    current_block_start_line = -1
+    current_block_lines = []
+
+    for line_no, line in enumerate(lines, start=1):
+        stripped = line.lstrip()
+
+        # --- Detect opening fence ---
+        if not (in_code_block3 or in_code_block4):
+            if stripped.startswith("```"):
+                current_block_start_line = line_no
+                count = len(stripped) - len(stripped.lstrip("`"))
+                if count == 3:
+                    in_code_block3 = True
+                    current_block_lang = get_code_block_lang(stripped)
+                    current_block_lines = [line]
+                    continue
+                elif count >= 4:
+                    in_code_block4 = True
+                    current_block_lang = get_code_block_lang(stripped)
+                    current_block_lines = [line]
+                    continue
+
+        # --- Detect closing fence ---
+        elif in_code_block3:
+            if stripped.startswith("```"):
+                count = len(stripped) - len(stripped.lstrip("`"))
+                if count == 3:
+                    current_block_lines.append(line)
+                    blocks.append(
+                        (
+                            current_block_lang,
+                            current_block_start_line,
+                            "\n".join(current_block_lines),
+                        )
+                    )
+                    in_code_block3 = False
+                    current_block_lang = ""
+                    current_block_start_line = -1
+                    continue
+            current_block_lines.append(line)
+
+        elif in_code_block4:
+            if stripped.startswith("````"):
+                count = len(stripped) - len(stripped.lstrip("`"))
+                if count >= 4:
+                    current_block_lines.append(line)
+                    blocks.append(
+                        (
+                            current_block_lang,
+                            current_block_start_line,
+                            "\n".join(current_block_lines),
+                        )
+                    )
+                    in_code_block4 = False
+                    current_block_lang = ""
+                    current_block_start_line = -1
+                    continue
+            current_block_lines.append(line)
+
+    return blocks
+
+
+def replace_blocks(source_text: str, target_text: str) -> str:
+    source_blocks = extract_multiline_blocks(source_text)
+    target_blocks = extract_multiline_blocks(target_text)
+
+    if len(source_blocks) != len(target_blocks):
+        raise CompareError(
+            f"Number of code blocks differs: "
+            f"{len(source_blocks)} in source vs {len(target_blocks)} in target."
+        )
+
+    for i, ((src_lang, *_), (tgt_lang, tgt_line_no, *_)) in enumerate(
+        zip(source_blocks, target_blocks), 1
+    ):
+        if src_lang != tgt_lang:
+            raise CompareError(
+                f"Type mismatch in block #{i} (line {tgt_line_no}): "
+                f"'{src_lang or '(no lang)'}' vs '{tgt_lang or '(no lang)'}'"
+            )
+
+    # Sequentially replace each block in target with the one from source
+    result = target_text
+    for (*_, src_block), (*_, tgt_block) in zip(source_blocks, target_blocks):
+        result = result.replace(tgt_block, src_block, 1)
+
+    return result
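+
+
+# Sketch of the intended effect (hypothetical content): a translated block like
+#     ```python
+#     print("hola")
+#     ```
+# is overwritten wholesale by the matching English block, so code samples cannot
+# drift from the originals; only the language tags of paired blocks must match.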
+
+
+# ===================================================================================
+# Headers and permalinks
+
+header_with_permalink_pattern = re.compile(r"^(#{1,6}) (.+?)(\s*\{\s*#.*\s*\})?\s*$")
+
+
+def extract_headers_and_permalinks(
+    lines: list[str],
+) -> list[tuple[str, int, str | None]]:
+    headers = []
+    in_code_block3 = False
+    in_code_block4 = False
+
+    for line_no, line in enumerate(lines, start=1):
+        if not (in_code_block3 or in_code_block4):
+            if line.startswith("```"):
+                count = len(line) - len(line.lstrip("`"))
+                if count == 3:
+                    in_code_block3 = True
+                    continue
+                elif count >= 4:
+                    in_code_block4 = True
+                    continue
+
+            header_match = header_with_permalink_pattern.match(line)
+            if header_match:
+                hashes, _title, permalink = header_match.groups()
+                headers.append((hashes, line_no, permalink))
+
+        elif in_code_block3:
+            if line.startswith("```"):
+                count = len(line) - len(line.lstrip("`"))
+                if count == 3:
+                    in_code_block3 = False
+                    continue
+
+        elif in_code_block4:
+            if line.startswith("````"):
+                count = len(line) - len(line.lstrip("`"))
+                if count >= 4:
+                    in_code_block4 = False
+                    continue
+
+    return headers
+
+
+def replace_headers_and_permalinks(source_text: str, target_text: str) -> str:
+    target_lines = target_text.splitlines()
+
+    source_headers = extract_headers_and_permalinks(source_text.splitlines())
+    target_headers = extract_headers_and_permalinks(target_lines)
+
+    if len(source_headers) != len(target_headers):
+        raise CompareError(
+            f"Number of headers differs: "
+            f"{len(source_headers)} in source vs {len(target_headers)} in target."
+        )
+
+    for i, ((src_hashes, *_), (tgt_hashes, tgt_line_no, *_)) in enumerate(
+        zip(source_headers, target_headers), 1
+    ):
+        if src_hashes != tgt_hashes:
+            raise CompareError(
+                f"Header level mismatch in #{i} (line {tgt_line_no}): "
+                f"'{src_hashes}' vs '{tgt_hashes}'"
+            )
+
+    # Sequentially replace each header permalink in target with the one from source
+    for src_header, tgt_header in zip(source_headers, target_headers):
+        src_permalink = src_header[2]
+        tgt_line_no = tgt_header[1] - 1  # Convert from 1-based to 0-based
+        header_match = header_with_permalink_pattern.match(target_lines[tgt_line_no])
+        if header_match:
+            hashes, title, _ = header_match.groups()
+            target_lines[tgt_line_no] = (
+                f"{hashes} {title}{src_permalink or ' (ERROR - MISSING PERMALINK)'}"
+            )
+
+    target_lines.append("")  # To preserve the empty line at the end of the file
+    return "\n".join(target_lines)
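+
+
+# A hypothetical example of the header format handled above:
+#     ## First Steps { #first-steps }
+# Only the `{ #... }` permalink is copied from the English source; the translated
+# header title itself is kept as-is.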
+
+
+# ===================================================================================
+# Links
+
+MARKDOWN_LINK_RE = re.compile(
+    r"\[(?P<text>.*?)\]"  # link text (non-greedy)
+    r"\("
+    r"(?P<url>\S+?)"  # url (no spaces, non-greedy)
+    r'(?:\s+["\'](?P<title>.*?)["\'])?'  # optional title in "" or ''
+    r"\)"
+)
+
+
+def extract_markdown_links(lines: list[str]) -> list[tuple[str, int]]:
+    links = []
+    for line_no, line in enumerate(lines, start=1):
+        for m in MARKDOWN_LINK_RE.finditer(line):
+            url = m.group("url")
+            links.append((url, line_no))
+    return links
+
+
+def replace_markdown_links(source_text: str, target_text: str, lang: str) -> str:
+    target_lines = target_text.splitlines()
+    source_links = extract_markdown_links(source_text.splitlines())
+    target_links = extract_markdown_links(target_lines)
+
+    if len(source_links) != len(target_links):
+        raise CompareError(
+            f"Number of markdown links differs: "
+            f"{len(source_links)} in source vs {len(target_links)} in target."
+        )
+
+    # Sequentially replace each link URL in target with the one from source
+    for (src_link, _), (tgt_link, tgt_line_no) in zip(source_links, target_links):
+        real_line_no = tgt_line_no - 1  # Convert to zero-based
+        line = target_lines[real_line_no]
+        link_replace = add_lang_code_if_needed(src_link, tgt_link, lang)
+        target_lines[real_line_no] = line.replace(tgt_link, link_replace)
+
+    target_lines.append("")  # To preserve the empty line at the end of the file
+    return "\n".join(target_lines)
+
+
+HTML_LINK_RE = re.compile(r"<a\s+[^>]*>.*?</a>")
+HTML_LINK_TEXT = re.compile(r"<a\b([^>]*)>(.*?)</a>")
+HTML_LINK_OPEN_TAG_RE = re.compile(r"<a\b([^>]*)>")
+HTML_ATTR_RE = re.compile(r'(\w+)\s*=\s*([\'"])(.*?)\2')
+
+
+def extract_html_links(
+    lines: list[str],
+) -> list[tuple[tuple[str, list[tuple[str, str, str]], str], int]]:
+    links = []
+    for line_no, line in enumerate(lines, start=1):
+        for html_link in HTML_LINK_RE.finditer(line):
+            link_str = html_link.group(0)
+            link_text = cast(re.Match, HTML_LINK_TEXT.match(link_str)).group(2)
+            link_data = (link_str, [], link_text)
+            link_open_tag = cast(re.Match, HTML_LINK_OPEN_TAG_RE.match(link_str)).group(
+                1
+            )
+            attributes = re.findall(HTML_ATTR_RE, link_open_tag)
+            for attr_data in attributes:
+                link_data[1].append(attr_data)
+            links.append((link_data, line_no))
+    return links
+
+
+TIANGOLO_COM = "https://fastapi.tiangolo.com"
+
+
+def add_lang_code_if_needed(url: str, prev_url: str, lang_code: str) -> str:
+    if url.startswith(TIANGOLO_COM):
+        if prev_url.startswith(f"{TIANGOLO_COM}/{lang_code}"):
+            url = url.replace(TIANGOLO_COM, f"{TIANGOLO_COM}/{lang_code}")
+    return url
+
+
+def reconstruct_html_link(
+    attributes: list[tuple[str, str, str]],
+    link_text: str,
+    prev_attributes: list[tuple[str, str, str]],
+    lang_code: str,
+) -> str:
+    prev_attributes_dict = {attr[0]: attr[2] for attr in prev_attributes}
+    prev_url = prev_attributes_dict["href"]
+    attributes_upd = []
+    for attr_name, attr_quotes, attr_value in attributes:
+        if attr_name == "href":
+            attr_value = add_lang_code_if_needed(attr_value, prev_url, lang_code)
+        attributes_upd.append((attr_name, attr_quotes, attr_value))
+
+    attrs_str = " ".join(
+        f"{name}={quotes}{value}{quotes}" for name, quotes, value in attributes_upd
+    )
+    return f"<a {attrs_str}>{link_text}</a>"
+
+
+def replace_html_links(source_text: str, target_text: str, lang: str) -> str:
+    target_lines = target_text.splitlines()
+    source_links = extract_html_links(source_text.splitlines())
+    target_links = extract_html_links(target_lines)
+
+    if len(source_links) != len(target_links):
+        raise CompareError(
+            f"Number of HTML links differs: "
+            f"{len(source_links)} in source vs {len(target_links)} in target."
+        )
+
+    # Sequentially replace the attributes of each link in target with the ones from source
+    for (src_link_data, _), (tgt_link_data, tgt_line_no) in zip(
+        source_links, target_links
+    ):
+        real_line_no = tgt_line_no - 1  # Convert to zero-based
+        line = target_lines[real_line_no]
+        tgt_link_text = tgt_link_data[2]
+
+        tgt_link_original = tgt_link_data[0]
+        tgt_link_override = reconstruct_html_link(
+            src_link_data[1], tgt_link_text, tgt_link_data[1], lang
+        )
+        target_lines[real_line_no] = line.replace(tgt_link_original, tgt_link_override)
+
+    target_lines.append("")  # To preserve the empty line at the end of the file
+    return "\n".join(target_lines)
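+
+
+# Sketch of the language-prefix rule (hypothetical URLs): if the translation
+# previously linked to https://fastapi.tiangolo.com/ru/tutorial/, the English URL
+# https://fastapi.tiangolo.com/tutorial/ is rewritten to keep the /ru prefix:
+#     add_lang_code_if_needed(
+#         "https://fastapi.tiangolo.com/tutorial/",
+#         "https://fastapi.tiangolo.com/ru/tutorial/",
+#         "ru",
+#     )  # -> "https://fastapi.tiangolo.com/ru/tutorial/"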
+
+
+# ===================================================================================
+# Images
+
+
+# ===================================================================================
+# Helper functions
+
+
+def get_lang_doc_root_dir(lang: str) -> Path:
+    return ROOT / DOCS_ROOT / lang / "docs"
+
+
+def iter_all_lang_paths(lang_path_root: Path) -> Iterable[Path]:
+    """
+    Iterate over the markdown files to translate in order of priority.
+    """
+
+    first_dirs = [
+        lang_path_root / "learn",
+        lang_path_root / "tutorial",
+        lang_path_root / "advanced",
+        lang_path_root / "about",
+        lang_path_root / "how-to",
+    ]
+    first_parent = lang_path_root
+    yield from first_parent.glob("*.md")
+    for dir_path in first_dirs:
+        yield from dir_path.rglob("*.md")
+    first_dirs_str = tuple(str(d) for d in first_dirs)
+    for path in lang_path_root.rglob("*.md"):
+        if str(path).startswith(first_dirs_str):
+            continue
+        if path.parent == first_parent:
+            continue
+        yield path
+
+
+def get_all_paths(lang: str) -> list[str]:
+    res: list[str] = []
+    lang_docs_root = get_lang_doc_root_dir(lang)
+    for path in iter_all_lang_paths(lang_docs_root):
+        relpath = path.relative_to(lang_docs_root)
+        if not str(relpath).startswith(non_translated_sections):
+            res.append(str(relpath))
+    return res
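+
+
+# Sketch of the resulting order (hypothetical file names): for lang="ru" this
+# yields top-level pages like "index.md" first, then everything under learn/,
+# tutorial/, advanced/, about/ and how-to/, then the remaining markdown files,
+# with the non_translated_sections filtered out.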
+
+
+# ===================================================================================
+# Main
+
+
+def process_one_file_with_retry(document_path: str, config: Config) -> bool:
+    en_docs_root_path = get_lang_doc_root_dir("en")
+    lang_docs_root_path = get_lang_doc_root_dir(config.lang)
+    while True:
+        try:
+            return process_one_file(
+                en_docs_root_path / document_path,
+                lang_docs_root_path / document_path,
+                config=config,
+            )
+        except Retry:  # Retry is only raised in interactive mode
+            pass
+
+
+def process_one_file(
+    en_doc_path: Path, lang_doc_path: Path, config: Config
+) -> bool:
+    if not en_doc_path.exists():
+        print(f"❌🔎 {en_doc_path} - doesn't exist")
+        return False
+
+    en_doc_text = en_doc_path.read_text(encoding="utf-8")
+    lang_doc_text = lang_doc_path.read_text(encoding="utf-8")
+    lang_doc_text_orig = lang_doc_text
+
+    try:
+        if config.check_code_includes:
+            lang_doc_text = replace_code_includes(
+                source_text=en_doc_text,
+                target_text=lang_doc_text,
+            )
+        if config.check_multiline_blocks:
+            lang_doc_text = replace_blocks(
+                source_text=en_doc_text,
+                target_text=lang_doc_text,
+            )
+        if config.check_headers_and_permalinks:
+            lang_doc_text = replace_headers_and_permalinks(
+                source_text=en_doc_text,
+                target_text=lang_doc_text,
+            )
+        if config.check_markdown_links:
+            lang_doc_text = replace_markdown_links(
+                source_text=en_doc_text,
+                target_text=lang_doc_text,
+                lang=config.lang,
+            )
+        if config.check_html_links:
+            lang_doc_text = replace_html_links(
+                source_text=en_doc_text,
+                target_text=lang_doc_text,
+                lang=config.lang,
+            )
+
+    except CompareError as e:
+        print(f"❔❌ {lang_doc_path} Error: {e}")
+        if not config.interactive:
+            return False
+        subprocess.run([VSCODE_COMMAND, "--diff", lang_doc_path, en_doc_path])
+        resp = ""
+        while resp not in ("f", "e"):
+            resp = input(
+                " Check the diff, fix the problem, and then type F if it's fixed or E to mark as invalid and skip: "
+            ).lower()
+        if resp == "e":
+            print(f"❌ {lang_doc_path} skipped with error")
+            return False
+        print(f"Check {lang_doc_path} again")
+        raise Retry() from None
+
+    if lang_doc_text_orig != lang_doc_text:
+        print(f"❔🆚 {lang_doc_path} - non-empty diff")
+        if not config.interactive:
+            return False
+        tmp_path = ROOT / TMP_DOCS_PATH / lang_doc_path
+        tmp_path.parent.mkdir(parents=True, exist_ok=True)
+        tmp_path.write_text(lang_doc_text, encoding="utf-8")
+        subprocess.run(
+            [VSCODE_COMMAND, "--diff", str(lang_doc_path), str(tmp_path)]
+        )
+        resp = ""
+        while resp not in ("f", "e"):
+            resp = input(
+                " Check the diff, fix the problem, and then type F to mark it as fixed or E to mark as invalid and skip: "
+            ).lower()
+        if resp == "e":
+            print(f"❌ {lang_doc_path} skipped with non-empty diff")
+            return False
+
+    print(f"✅ {lang_doc_path}")
+    return True
+
+
+# ===================================================================================
+# Typer app
+
+cli = typer.Typer()
+
+
+@cli.callback()
+def main(
+    ctx: typer.Context,
+    lang: Annotated[LANGS, typer.Option()],
+    interactive: Annotated[
+        bool,
+        typer.Option(
+            help="If True, will open VSCode diffs for each change to fix and confirm.",
+        ),
+    ] = True,
+    full_paths: Annotated[
+        bool,
+        typer.Option(
+            help="If True, the provided document paths are treated as full paths.",
+        ),
+    ] = False,
+    check_code_includes: Annotated[
+        bool,
+        typer.Option(
+            help="If True, will compare code include blocks.",
+        ),
+    ] = True,
+    check_multiline_blocks: Annotated[
+        bool,
+        typer.Option(
+            help="If True, will compare multiline code blocks.",
+        ),
+    ] = True,
+    check_headers_and_permalinks: Annotated[
+        bool,
+        typer.Option(
+            help="If True, will compare headers and permalinks.",
+        ),
+    ] = True,
+    check_markdown_links: Annotated[
+        bool,
+        typer.Option(
+            help="If True, will compare markdown links.",
+        ),
+    ] = True,
+    check_html_links: Annotated[
+        bool,
+        typer.Option(
+            help="If True, will compare HTML links.",
+        ),
+    ] = True,
+):
+    ctx.obj = Config(
+        lang=lang,
+        interactive=interactive,
+        full_paths=full_paths,
+        check_code_includes=check_code_includes,
+        check_multiline_blocks=check_multiline_blocks,
+        check_headers_and_permalinks=check_headers_and_permalinks,
+        check_markdown_links=check_markdown_links,
+        check_html_links=check_html_links,
+    )
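+
+
+# Usage sketch (assuming the script is run from the scripts/ directory, as the
+# ROOT constant expects; typer exposes the commands with dashes):
+#     python cmpr.py --lang ru process-all
+#     python cmpr.py --lang de --no-interactive process-pages tutorial/index.md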
+ """ + config = cast(Config, ctx.obj) + lang_docs_root_path = get_lang_doc_root_dir(config.lang) + docs = get_all_paths(config.lang) + + all_good = True + pages_with_errors: list[str] = [] + for doc in docs: + res = process_one_file_with_retry(document_path=doc, config=config) + all_good = all_good and res + if not res: + pages_with_errors.append(doc) + + if not all_good: + print("Some documents had errors:") + docs_path = lang_docs_root_path.relative_to(ROOT) + for page in pages_with_errors: + print(f" - {docs_path / page}") + raise typer.Exit(code=1) + + +@cli.command() +def process_pages( + doc_paths: Annotated[ + list[str], + typer.Argument( + help="List of relative paths to the EN documents. Should be relative to docs/en/docs/", + ), + ], + ctx: typer.Context, +): + """ + Compare special blocks of specified EN documents with the corresponding blocks in + translated versions of those documents. + """ + + config = cast(Config, ctx.obj) + lang_docs_root_path = get_lang_doc_root_dir(config.lang) + + all_good = True + pages_with_errors: list[str] = [] + for doc_path in doc_paths: + if config.full_paths: + path = ROOT / doc_path.lstrip("/") + doc_path = str(path.relative_to(lang_docs_root_path)) + res = process_one_file_with_retry(document_path=doc_path, config=config) + all_good = all_good and res + if not res: + pages_with_errors.append(doc_path) + + if not all_good: + print("Some documents had errors:") + docs_path = lang_docs_root_path.relative_to(ROOT) + for page in pages_with_errors: + print(f" - {docs_path / page}") + raise typer.Exit(code=1) + + +if __name__ == "__main__": + cli()