"""
List commits in the private repo that need to be synced to the OSS repo.

NOTE:
1. This script resolves the git root automatically and can be run anywhere
   inside the repo.

This script will:
1. Find the most recent sync commit (message starts with
   "[Automated PR] Copy OSS code from commit").
2. Scan commits after that point and keep those that touch the configured paths.
3. Compare added diff lines in relevant files against OSS main.
4. Print a markdown summary with commit links and write it to GitHub Step Summary.

Usage:
    python3 scripts/code_sync/check_commits.py
"""

import argparse
import os
import shutil
import subprocess
import sys
from dataclasses import dataclass
from typing import Dict, List, Optional, Set, Tuple

# Allow sibling imports regardless of the working directory.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from utils import (  # noqa: E402
    FOLDER_NAMES,
    get_last_sync_commit,
    write_github_step_summary,
)

# --- Configuration Begin ---
private_repo = "your-org/sglang-private-repo"
oss_repo_url = "https://github.com/sgl-project/sglang.git"
oss_repo_branch = "main"
default_oss_repo_dir = ".oss_repo"
# --- Configuration End ---


@dataclass
class CommitInfo:
    """Everything needed to render one commit in the markdown report."""

    # Full hash of the private-repo commit.
    commit_hash: str
    # First line of the commit message.
    subject: str
    # Author date as printed by git with --date=short.
    commit_date: str
    # Changed files that fall under one of the FOLDER_NAMES prefixes.
    relevant_files: List[str]
    # Number of added lines that also appear in the OSS copy of the same file.
    synced_lines: int
    # Number of lines the commit added across relevant_files.
    total_added_lines: int


def check_dependencies() -> None:
    """Check for required command-line tools.

    Raises:
        EnvironmentError: if ``git`` cannot be found on PATH.
    """
    if not shutil.which("git"):
        raise EnvironmentError("git is not installed or not in PATH.")


def get_repo_root() -> str:
    """Return the absolute path of the git work tree containing the cwd.

    Raises:
        RuntimeError: if git fails or reports an empty top-level path.
    """
    try:
        output = subprocess.run(
            ["git", "rev-parse", "--show-toplevel"],
            capture_output=True,
            text=True,
            check=True,
        ).stdout.strip()
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Unable to determine git repo root: {e.stderr or e}") from e
    if not output:
        raise RuntimeError("Unable to determine git repo root.")
    return os.path.abspath(output)


def get_repo_from_origin(repo_root: str) -> str:
    """Try to infer the repo slug (owner/name) from git remote.origin.url.

    Only ``git@github.com:`` and ``https://github.com/`` URLs are recognised;
    anything else (including a missing remote) falls back to ``private_repo``.
    """
    try:
        url = subprocess.run(
            ["git", "config", "--get", "remote.origin.url"],
            capture_output=True,
            text=True,
            check=True,
            cwd=repo_root,
        ).stdout.strip()
    except subprocess.CalledProcessError:
        return private_repo

    for prefix in ("git@github.com:", "https://github.com/"):
        if url.startswith(prefix):
            repo = url[len(prefix):]
            break
    else:
        return private_repo

    if repo.endswith(".git"):
        repo = repo[: -len(".git")]
    return repo or private_repo


def get_default_oss_repo_path(repo_root: str) -> str:
    """Return $OSS_REPO_PATH if set, else ``<repo_root>/.oss_repo`` (absolute)."""
    env_path = os.environ.get("OSS_REPO_PATH")
    if env_path:
        return os.path.abspath(env_path)
    return os.path.abspath(os.path.join(repo_root, default_oss_repo_dir))


def ensure_oss_repo(oss_repo_path: str, repo_url: str, branch: str) -> str:
    """Make sure a clone of the OSS repo with an up-to-date ``origin/<branch>`` exists.

    An existing clone is refreshed with a shallow fetch; otherwise a shallow
    clone of ``branch`` is created at ``oss_repo_path``.

    Args:
        oss_repo_path: Where the clone lives (or should be created).
        repo_url: URL to clone from when no clone exists yet.
        branch: Branch whose tip must be available as ``origin/<branch>``.

    Returns:
        The absolute path of the clone.

    Raises:
        RuntimeError: if the path exists but is not a directory / git repo.
        subprocess.CalledProcessError: if the fetch or clone itself fails.
    """
    oss_repo_path = os.path.abspath(oss_repo_path)
    if os.path.exists(oss_repo_path) and not os.path.isdir(oss_repo_path):
        raise RuntimeError(f"OSS repo path is not a directory: {oss_repo_path}")

    if os.path.isdir(os.path.join(oss_repo_path, ".git")):
        try:
            subprocess.run(
                ["git", "-C", oss_repo_path, "rev-parse", "--is-inside-work-tree"],
                capture_output=True,
                text=True,
                check=True,
            )
        except subprocess.CalledProcessError as e:
            raise RuntimeError(
                f"OSS repo path exists but is not a git repo: {oss_repo_path}"
            ) from e
        # Use an explicit refspec: a shallow clone is single-branch, so a plain
        # `git fetch origin <branch>` would only update FETCH_HEAD when <branch>
        # differs from the branch that was cloned, leaving origin/<branch>
        # (which the caller reads) missing or stale.
        subprocess.run(
            [
                "git",
                "-C",
                oss_repo_path,
                "fetch",
                "--depth",
                "1",
                "origin",
                f"+refs/heads/{branch}:refs/remotes/origin/{branch}",
            ],
            check=True,
        )
        return oss_repo_path

    parent_dir = os.path.dirname(oss_repo_path)
    if parent_dir and not os.path.isdir(parent_dir):
        os.makedirs(parent_dir, exist_ok=True)
    subprocess.run(
        ["git", "clone", "--depth", "1", "--branch", branch, repo_url, oss_repo_path],
        check=True,
    )
    return oss_repo_path


def get_commits_since(repo_root: str, last_sync_hash: Optional[str]) -> List[str]:
    """Get commit hashes from last sync commit (exclusive) to HEAD.

    With no ``last_sync_hash`` every commit reachable from HEAD is returned.
    Hashes come back in ``git rev-list`` order. On a git failure the error is
    reported on stderr and an empty list is returned.
    """
    if last_sync_hash:
        command = ["git", "rev-list", f"{last_sync_hash}..HEAD"]
    else:
        command = ["git", "rev-list", "HEAD"]
    try:
        result = subprocess.run(
            command, capture_output=True, text=True, check=True, cwd=repo_root
        ).stdout.strip()
    except subprocess.CalledProcessError as e:
        print(f"Error getting commit list: {e.stderr}", file=sys.stderr)
        return []
    return [line for line in result.split("\n") if line]


def get_changed_files(repo_root: str, commit_hash: str) -> List[str]:
    """Return the paths touched by ``commit_hash`` (empty list on git failure)."""
    try:
        output = subprocess.run(
            [
                "git",
                "diff-tree",
                "--no-commit-id",
                "--name-only",
                "-r",
                # Without --root a parentless commit prints nothing, so the
                # first commit of the history would never be considered.
                "--root",
                commit_hash,
            ],
            capture_output=True,
            text=True,
            check=True,
            cwd=repo_root,
        ).stdout.strip()
    except subprocess.CalledProcessError as e:
        print(
            f"Error getting changed files for {commit_hash}: {e.stderr}",
            file=sys.stderr,
        )
        return []
    return [line for line in output.split("\n") if line]


def is_relevant_path(changed_file: str, path_prefix: str) -> bool:
    """Return True if ``changed_file`` is ``path_prefix`` itself or lies beneath it.

    A trailing slash on ``path_prefix`` is ignored, so ``"python"`` and
    ``"python/"`` behave the same.
    """
    prefix = path_prefix.rstrip("/")
    if changed_file == prefix:
        return True
    return changed_file.startswith(f"{prefix}/")


def get_relevant_files(changed_files: List[str]) -> List[str]:
    """Keep only the files located under one of the FOLDER_NAMES prefixes."""
    return [
        changed_file
        for changed_file in changed_files
        if any(is_relevant_path(changed_file, path) for path in FOLDER_NAMES)
    ]


def _path_from_new_file_header(line: str) -> Optional[str]:
    """Extract the repo-relative path from a ``+++ ...`` diff header line."""
    candidate = line[4:]
    # git appends a TAB to header paths that contain spaces.
    if candidate.endswith("\t"):
        candidate = candidate[:-1]
    if candidate == "/dev/null":
        return None
    if candidate.startswith(("a/", "b/")):
        return candidate[2:]
    return candidate


def _parse_added_lines(
    diff_text: str, relevant_files: List[str]
) -> Dict[str, List[str]]:
    """Collect the added lines of each relevant file from unified diff text."""
    added_lines: Dict[str, List[str]] = {path: [] for path in relevant_files}
    relevant_set = set(relevant_files)
    current_file: Optional[str] = None
    # True between a "diff --..." line and the first hunk header of that file.
    # "+++ " is only a file header there; inside a hunk it is an added line
    # whose content happens to start with "++ ".
    in_header = False

    for line in diff_text.splitlines():
        if line.startswith("diff --"):
            current_file = None
            in_header = True
            continue
        if in_header:
            if line.startswith("@@"):
                in_header = False
            elif line.startswith("+++ "):
                file_path = _path_from_new_file_header(line)
                current_file = file_path if file_path in relevant_set else None
            continue
        if current_file and line.startswith("+"):
            added_lines[current_file].append(line[1:])
    return added_lines


def get_added_lines_by_file(
    repo_root: str, commit_hash: str, relevant_files: List[str]
) -> Dict[str, List[str]]:
    """Return the lines ``commit_hash`` added, grouped by relevant file.

    Args:
        repo_root: Work tree in which git is run.
        commit_hash: Commit whose diff is inspected.
        relevant_files: Paths the diff is restricted to.

    Returns:
        A dict with one entry per relevant file (possibly an empty list), each
        line without its leading ``+``. An empty dict if ``relevant_files`` is
        empty or git fails.
    """
    if not relevant_files:
        return {}
    command = [
        "git",
        "show",
        "--no-color",
        "--unified=0",
        "--format=",
        commit_hash,
        "--",
    ] + relevant_files
    try:
        output = subprocess.run(
            command,
            capture_output=True,
            text=True,
            # Decode exactly like get_oss_file_lines so both sides compare
            # equal, and so a non-UTF-8 byte cannot abort the whole run.
            encoding="utf-8",
            errors="replace",
            check=True,
            cwd=repo_root,
        ).stdout
    except subprocess.CalledProcessError as e:
        print(f"Error getting diff for {commit_hash}: {e.stderr}", file=sys.stderr)
        return {}
    return _parse_added_lines(output, relevant_files)


def get_oss_file_lines(
    oss_repo_path: str,
    oss_ref: str,
    file_path: str,
    cache: Dict[str, Optional[Set[str]]],
) -> Optional[Set[str]]:
    """Return the set of lines of ``file_path`` at ``oss_ref`` in the OSS clone.

    Results, including failures, are memoised in ``cache`` keyed by file path.

    Returns:
        The distinct lines of the file, or None if git cannot show it.
    """
    if file_path in cache:
        return cache[file_path]
    try:
        output = subprocess.run(
            ["git", "-C", oss_repo_path, "show", f"{oss_ref}:{file_path}"],
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            check=True,
        ).stdout
    except subprocess.CalledProcessError:
        cache[file_path] = None
        return None
    line_set = set(output.splitlines())
    cache[file_path] = line_set
    return line_set


def count_synced_lines(
    added_lines_by_file: Dict[str, List[str]],
    oss_repo_path: str,
    oss_ref: str,
    oss_file_cache: Dict[str, Optional[Set[str]]],
) -> Tuple[int, int]:
    """Count how many added lines already exist in the OSS copy of their file.

    A line counts as synced when the same text appears anywhere in the OSS
    file; position is not taken into account.

    Returns:
        ``(synced_lines, total_added_lines)``.
    """
    total_added_lines = 0
    synced_lines = 0
    for file_path, lines in added_lines_by_file.items():
        total_added_lines += len(lines)
        if not lines:
            continue
        oss_lines = get_oss_file_lines(
            oss_repo_path, oss_ref, file_path, oss_file_cache
        )
        if not oss_lines:
            continue
        synced_lines += sum(1 for line in lines if line in oss_lines)
    return synced_lines, total_added_lines


def get_commit_summary(repo_root: str, commit_hash: str) -> Tuple[str, str]:
    """Return (subject, date) for a commit.

    Placeholder strings are returned (and the error reported on stderr) when
    git fails.
    """
    try:
        output = subprocess.run(
            ["git", "show", "-s", "--format=%s%x00%ad", "--date=short", commit_hash],
            capture_output=True,
            text=True,
            check=True,
            cwd=repo_root,
        ).stdout.strip()
    except subprocess.CalledProcessError as e:
        print(
            f"Error getting commit subject for {commit_hash}: {e.stderr}",
            file=sys.stderr,
        )
        return "(unknown subject)", "(unknown date)"
    # partition never raises, unlike unpacking the result of split().
    subject, _, commit_date = output.partition("\x00")
    return subject, commit_date


def format_files_list(relevant_files: List[str]) -> str:
    """Render the files as a markdown bullet list, one file per line."""
    return "\n".join([f"- {file_path}" for file_path in relevant_files])


def format_last_sync_block(
    repo: str, subject: str, commit_hash: str, commit_date: str
) -> str:
    """Render the "Last sync" markdown section for the given sync commit."""
    short_hash = commit_hash[:9]
    commit_url = f"https://github.com/{repo}/commit/{commit_hash}"
    return "\n".join(
        [
            "## Last sync",
            "",
            f"#### {subject}",
            f"date: {commit_date}",
            f"commit: [{short_hash}]({commit_url})",
            "",
        ]
    )


def format_commit_block(
    repo: str,
    subject: str,
    commit_hash: str,
    commit_date: str,
    relevant_files: List[str],
    synced_lines: int,
    total_added_lines: int,
) -> str:
    """Render one commit as a markdown block with its sync status.

    The status shows a check mark only when every added line was found in
    the OSS repo (``synced_lines == total_added_lines``).
    """
    short_hash = commit_hash[:9]
    commit_url = f"https://github.com/{repo}/commit/{commit_hash}"
    files_str = format_files_list(relevant_files) if relevant_files else "- None"
    status_icon = "✅" if synced_lines == total_added_lines else "❌"
    status_line = (
        f"status: {status_icon} {synced_lines}/{total_added_lines} lines synced"
    )
    return "\n".join(
        [
            f"#### {subject}",
            status_line,
            f"date: {commit_date}",
            "files to sync:",
            files_str,
            "",
            f"commit: [{short_hash}]({commit_url})",
            "",
        ]
    )


def format_output(
    repo: str,
    last_sync: Optional[Tuple[str, str, str]],
    commits: List[CommitInfo],
) -> str:
    """Build the full markdown report.

    Args:
        repo: ``owner/name`` slug used to build commit links.
        last_sync: ``(subject, commit_hash, commit_date)`` of the last sync
            commit, or None if there is none.
        commits: Commits to list under "Commits to sync".
    """
    lines: List[str] = []
    if last_sync:
        subject, commit_hash, commit_date = last_sync
        lines.append(format_last_sync_block(repo, subject, commit_hash, commit_date))
    else:
        lines.extend(["## Last sync", "", "No sync commit found.", ""])

    lines.extend(["## Commits to sync", ""])
    if not commits:
        lines.append("No commits need to be synced.")
        return "\n".join(lines) + "\n"

    for commit in commits:
        lines.append(
            format_commit_block(
                repo,
                commit.subject,
                commit.commit_hash,
                commit.commit_date,
                commit.relevant_files,
                commit.synced_lines,
                commit.total_added_lines,
            )
        )
    return "\n".join(lines)


def main() -> None:
    """Parse arguments, gather the unsynced commits and print the report."""
    parser = argparse.ArgumentParser(
        description="List commits in the private repo that need to be synced to OSS."
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=0,
        help="Limit number of commits printed (0 means no limit).",
    )
    parser.add_argument(
        "--oss-repo-path",
        default=None,
        help="Path to OSS repo clone (default: $OSS_REPO_PATH or .oss_repo).",
    )
    parser.add_argument(
        "--oss-repo-url",
        default=oss_repo_url,
        help="OSS repo URL (default: https://github.com/sgl-project/sglang.git).",
    )
    parser.add_argument(
        "--oss-branch",
        default=oss_repo_branch,
        help="OSS repo branch to check (default: main).",
    )
    args = parser.parse_args()

    check_dependencies()
    repo_root = get_repo_root()
    oss_repo_path = (
        os.path.abspath(args.oss_repo_path)
        if args.oss_repo_path
        else get_default_oss_repo_path(repo_root)
    )
    repo = get_repo_from_origin(repo_root)

    last_sync_hash = get_last_sync_commit(repo_root)
    last_sync_block = None
    if last_sync_hash:
        last_sync_subject, last_sync_date = get_commit_summary(
            repo_root, last_sync_hash
        )
        last_sync_block = (last_sync_subject, last_sync_hash, last_sync_date)

    commits = get_commits_since(repo_root, last_sync_hash)
    # NOTE(review): the limit is applied before relevance filtering, so fewer
    # than --limit commits may end up printed. Confirm this is intended given
    # the option's help text.
    if args.limit > 0:
        commits = commits[: args.limit]

    relevant_commit_inputs: List[Tuple[str, List[str]]] = []
    for commit_hash in commits:
        changed_files = get_changed_files(repo_root, commit_hash)
        if not changed_files:
            continue
        relevant_files = get_relevant_files(changed_files)
        if relevant_files:
            relevant_commit_inputs.append((commit_hash, relevant_files))

    relevant_commits: List[CommitInfo] = []
    # The OSS clone is only needed (and only fetched) when there is something
    # to compare against it.
    if relevant_commit_inputs:
        oss_repo_path = ensure_oss_repo(
            oss_repo_path, args.oss_repo_url, args.oss_branch
        )
        oss_ref = f"origin/{args.oss_branch}"
        oss_file_cache: Dict[str, Optional[Set[str]]] = {}
        for commit_hash, relevant_files in relevant_commit_inputs:
            subject, commit_date = get_commit_summary(repo_root, commit_hash)
            added_lines_by_file = get_added_lines_by_file(
                repo_root, commit_hash, relevant_files
            )
            synced_lines, total_added_lines = count_synced_lines(
                added_lines_by_file, oss_repo_path, oss_ref, oss_file_cache
            )
            relevant_commits.append(
                CommitInfo(
                    commit_hash=commit_hash,
                    subject=subject,
                    commit_date=commit_date,
                    relevant_files=relevant_files,
                    synced_lines=synced_lines,
                    total_added_lines=total_added_lines,
                )
            )

    output = format_output(repo, last_sync_block, relevant_commits)
    print(output)
    if os.environ.get("GITHUB_STEP_SUMMARY"):
        write_github_step_summary(output)


if __name__ == "__main__":
    main()