import os
import re
from pathlib import Path
from typing import Optional

import requests

from .model import Languages, Summary, TranslationDoc
from .project_config import get_project_config

# requests never times out by default, so a stalled connection would hang the
# caller forever; bound every GitHub call instead.
_REQUEST_TIMEOUT = 30

# Only English source documents under this prefix are candidates for matching.
_EN_DOCS_PREFIX = "docs/source/en/"

# Captures the file name that follows "Translated" in a PR title, either
# wrapped in backticks (group 1) or as a bare token (group 2).
_TRANSLATED_FILE_PATTERN = re.compile(r"Translated\s+(?:`([^`]+)`|(\S+))\s+to")


def _github_headers(accept: Optional[str] = None) -> dict:
    """Build request headers, adding the GITHUB_TOKEN credential when it is set."""
    headers = {}
    if accept:
        headers["Accept"] = accept
    # The token is optional; it only raises the API rate limit.
    github_token = os.environ.get("GITHUB_TOKEN")
    if github_token:
        headers["Authorization"] = f"token {github_token}"
    return headers


def _check_github_response(response) -> None:
    """Raise an Exception describing the failure unless the response is a 200."""
    if response.status_code == 403 and "rate limit" in response.text.lower():
        raise Exception(
            "GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in "
            "your environment or provide a GitHub token in the UI. "
            f"Details: {response.text}"
        )
    if response.status_code != 200:
        raise Exception(f"GitHub API error: {response.status_code} {response.text}")


def _index_english_docs(all_files) -> dict:
    """Map each English doc's base name (no ``.md``) to its first repo path."""
    index = {}
    for file_path in all_files:
        if file_path.startswith(_EN_DOCS_PREFIX) and file_path.endswith(".md"):
            base_name = file_path.rsplit("/", 1)[-1].removesuffix(".md")
            # setdefault keeps the first occurrence, matching a first-hit scan.
            index.setdefault(base_name, file_path)
    return index


def get_github_repo_files(project: str = "transformers"):
    """
    Get github repo files

    Requests ``config.api_url`` for the given project and returns the paths of
    all ``blob`` entries in the response's ``tree`` whose path starts with
    ``docs``.

    Args:
        project: Project key passed to ``get_project_config``.

    Returns:
        A list of repository file paths (strings).

    Raises:
        Exception: If GitHub reports a rate limit or any non-200 status.
    """
    config = get_project_config(project)

    response = requests.get(
        config.api_url, headers=_github_headers(), timeout=_REQUEST_TIMEOUT
    )
    # Without this check an error payload has no "tree" key and the function
    # would silently return an empty list.
    _check_github_response(response)

    all_items = response.json().get("tree", [])
    return [
        item["path"]
        for item in all_items
        if item["type"] == "blob" and item["path"].startswith("docs")
    ]


def get_github_issue_open_pr(
    project: str = "transformers",
    lang: str = "ko",
    all_files: Optional[list] = None,
):
    """
    Get open PR in the github issue, filtered by title containing '[i18n-KO]'.

    Pages through every open pull request of the project's repository, keeps
    those whose title contains ``[i18n-KO]``, extracts the translated file name
    from the title and resolves it to a path under ``docs/source/en/``.

    Args:
        project: Project key passed to ``get_project_config``.
        lang: Language key looked up in ``config.github_issues``.
        all_files: Repository file paths used to resolve file names to paths.

    Returns:
        A ``(filenames, pr_info_list)`` tuple of parallel lists: the resolved
        English doc path and the pull request URL for each matching PR.

    Raises:
        ValueError: If no issue is registered for ``lang`` or ``all_files`` is
            not provided.
        Exception: If GitHub reports a rate limit or any non-200 status.
    """
    config = get_project_config(project)
    issue_id = config.github_issues.get(lang)

    # A registered tracking issue is required before searching for PRs.
    if not issue_id:
        raise ValueError(f"⚠️ No GitHub issue registered for {project}.")

    if all_files is None:
        raise ValueError("Repository file list must be provided")

    headers = _github_headers(accept="application/vnd.github+json")
    repo_path = config.repo_url.replace("https://github.com/", "")
    url = f"https://api.github.com/repos/{repo_path}/pulls"

    all_open_prs = []
    page = 1
    per_page = 100  # Maximum allowed by GitHub API

    while True:
        response = requests.get(
            url,
            headers=headers,
            params={"state": "open", "page": page, "per_page": per_page},
            timeout=_REQUEST_TIMEOUT,
        )
        _check_github_response(response)

        page_prs = response.json()
        all_open_prs.extend(page_prs)

        # A short (or empty) page is the last one.
        if len(page_prs) < per_page:
            break
        page += 1

    # NOTE(review): the tag is fixed to KO regardless of `lang`; confirm the
    # title convention for other languages before deriving it from `lang`.
    filtered_prs = [pr for pr in all_open_prs if "[i18n-KO]" in pr["title"]]

    # Built once so each PR is resolved with a dict lookup, not a full scan.
    english_docs = _index_english_docs(all_files)

    filenames = []
    pr_info_list = []

    for pr in filtered_prs:
        match = _TRANSLATED_FILE_PATTERN.search(pr["title"])
        if not match:
            continue

        # Group 1 is the backticked form, group 2 the bare form.
        filename = match.group(1) or match.group(2)
        if not filename.endswith(".md"):
            filename += ".md"

        # Prefer the real repo path; otherwise assume a top-level English doc.
        base_name = filename.removesuffix(".md")
        correct_path = english_docs.get(base_name) or f"{_EN_DOCS_PREFIX}{filename}"

        filenames.append(correct_path)
        # The API URL ends in the PR number, which is reused for the web URL.
        pr_number = pr["url"].rstrip("/").split("/")[-1]
        pr_info_list.append(f"{config.repo_url}/pull/{pr_number}")

    return filenames, pr_info_list


def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
    """
    Retrieve missing docs

    Builds a markdown table from the summary's counters, prints it together
    with the selected files, and returns both.

    Args:
        summary: Summary providing the counters and
            ``first_missing_translation_files``.
        table_size: Passed to ``summary.first_missing_translation_files``.

    Returns:
        A ``(report, first_missing_docs)`` tuple: the markdown table and the
        ``original_file`` of each returned entry.
    """
    # Named report_table so it does not shadow the module-level report().
    report_table = f"""
| Item | Count | Percentage |
|------|-------|------------|
| 📂 HuggingFaces docs | {summary.files_analyzed} | - |
| 🪹 Missing translations | {summary.files_missing_translation} | {summary.percentage_missing_translation:.2f}% |
"""
    print(report_table)

    first_missing_docs = [
        file.original_file
        for file in summary.first_missing_translation_files(table_size)
    ]
    print(first_missing_docs)

    return report_table, first_missing_docs


def report(
    project: str,
    target_lang: str,
    top_k: int = 1,
    docs_file: Optional[list] = None,
) -> tuple[str, list[str]]:
    """
    Generate a report for the translated docs

    For every ``.md`` file under ``docs/source/en`` in ``docs_file``, records
    whether the same relative path exists under the target language directory.

    Args:
        project: Not used by this function.
        target_lang: Member name looked up in ``Languages``.
        top_k: Passed to ``retrieve`` as its table size.
        docs_file: Repository file paths to analyse.

    Returns:
        The ``(report, first_missing_docs)`` tuple produced by ``retrieve``.

    Raises:
        ValueError: If ``docs_file`` is not provided.
        KeyError: If ``target_lang`` is not a member of ``Languages``.
    """
    if docs_file is None:
        raise ValueError("Repository file list must be provided")

    base_docs_path = Path("docs/source")
    en_docs_path = base_docs_path / "en"

    lang = Languages[target_lang]
    summary = Summary(lang=lang.value)

    # Set membership keeps the existence check O(1) per file.
    known_files = set(docs_file)

    for file in docs_file:
        if not file.endswith(".md"):
            continue

        try:
            file_relative_path = Path(file).relative_to(en_docs_path)
        except ValueError:
            # Not under docs/source/en, so it is not an English source doc.
            continue

        # Repo paths always use "/", so build a POSIX path on every platform.
        translated_path = (
            base_docs_path / lang.value / file_relative_path
        ).as_posix()

        doc = TranslationDoc(
            translation_lang=lang.value,
            original_file=file,
            translation_file=translated_path,
            translation_exists=translated_path in known_files,
        )
        summary.append_file(doc)

    return retrieve(summary, top_k)