|
|
|
|
|
""" |
|
|
Sync BitTransformerLM repository to HuggingFace Hub for OS launch. |
|
|
Uploads all cleaned documentation and code with proper commit message. |
|
|
""" |
|
|
|
|
|
import os |
|
|
import logging |
|
|
import re |
|
|
from pathlib import Path |
|
|
from huggingface_hub import HfApi, login |
|
|
from typing import Optional, List |
|
|
|
|
|
|
|
|
# Module-wide logging: timestamped INFO-level messages for sync progress.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
|
|
|
|
|
def scan_for_secrets(file_path: Path) -> List[str]:
    """Scan a single file for strings that look like leaked credentials.

    Args:
        file_path: Path of the file to scan.

    Returns:
        Human-readable findings, one per match, each naming the secret type,
        the 1-based line number, and a truncated preview of the matched text.
        Empty list when nothing suspicious is found or the file is unreadable.
    """
    secrets_found: List[str] = []

    # Known token formats plus loose "key/token/secret = <value>" assignments.
    # NOTE(review): IGNORECASE also relaxes the fixed prefixes (e.g. "akia"
    # matches the AWS pattern) — kept deliberately, favoring false positives
    # over missed secrets.
    secret_patterns = {
        'HuggingFace Token': r'hf_[A-Za-z0-9_]{30,}',
        'OpenAI API Key': r'sk-[A-Za-z0-9]{48}',
        'GitHub Token': r'gh[pousr]_[A-Za-z0-9_]{36,}',
        'AWS Access Key': r'AKIA[0-9A-Z]{16}',
        'Generic API Key': r'["\']?[Aa]pi[_-]?[Kk]ey["\']?\s*[:=]\s*["\']?[A-Za-z0-9_\-]{20,}["\']?',
        'Generic Token': r'["\']?[Tt]oken["\']?\s*[:=]\s*["\']?[A-Za-z0-9_\-]{20,}["\']?',
        'Generic Secret': r'["\']?[Ss]ecret["\']?\s*[:=]\s*["\']?[A-Za-z0-9_\-]{20,}["\']?',
    }

    try:
        # errors='ignore' lets us scan binary-ish files without raising.
        content = file_path.read_text(encoding='utf-8', errors='ignore')
    except OSError as e:
        # Best-effort scanner: an unreadable file is reported, not fatal.
        logger.warning(f"Could not scan {file_path} for secrets: {e}")
        return secrets_found

    for secret_type, pattern in secret_patterns.items():
        for match in re.finditer(pattern, content, re.IGNORECASE):
            # Count newlines before the match start to recover the line number.
            line_num = content[:match.start()].count('\n') + 1
            secrets_found.append(
                f"{secret_type} found at line {line_num}: {match.group()[:50]}..."
            )

    return secrets_found
|
|
|
|
|
|
|
|
def get_files_to_sync(repo_root: Path) -> List[Path]:
    """Get the exact list of files that will be synced to HuggingFace.

    Args:
        repo_root: Root directory of the local repository checkout.

    Returns:
        Sorted, de-duplicated list of file paths under ``repo_root`` that
        match at least one include pattern and no exclude pattern.
    """
    # Whitelist of glob patterns to upload (code, docs, packaging, license).
    include_patterns = [
        "bit_transformer/**/*.py",
        "tests/**/*.py",
        "scripts/**/*.py",
        "scripts/**/*.md",
        "*.py",
        "*.md",
        "*.txt",
        "*.toml",
        "*.sh",
        "Dockerfile",
        "LICENSE/**/*",
    ]

    # Blacklist applied to the repo-relative path of every matched file.
    # NOTE(review): Path.match() matches from the right and treats "**" like
    # a single "*", so "__pycache__/**" excludes files directly inside a
    # __pycache__ directory at any depth — confirm this covers deeper nesting.
    exclude_patterns = [
        "__pycache__/**",
        "*.pyc",
        ".git/**",
        ".pytest_cache/**",
        ".ipynb_checkpoints/**",
        "weights/**",
        "checkpoints/**",
        "*.log",
        "*.pt",
        "*.zip",
        "*-checkpoint.*",
        "*.tmp",
        "*.swp",
        ".DS_Store",
        "Thumbs.db",
    ]

    # A set prevents duplicates when include patterns overlap.
    files_to_upload = set()
    for pattern in include_patterns:
        for file_path in repo_root.glob(pattern):
            if not file_path.is_file():
                continue
            relative_path = file_path.relative_to(repo_root)
            if not any(relative_path.match(exclude) for exclude in exclude_patterns):
                files_to_upload.add(file_path)

    return sorted(files_to_upload)
|
|
|
|
|
|
|
|
def preview_sync(repo_root: Optional[Path] = None) -> None:
    """Preview what files will be synced without actually uploading.

    Args:
        repo_root: Repository root to scan.  Defaults to three directory
            levels above this file (assumed to be the repo checkout root —
            verify if this script is relocated).
    """
    if repo_root is None:
        # scripts/<subdir>/<this file> -> repo root is three parents up.
        repo_root = Path(__file__).parent.parent.parent

    files_to_upload = get_files_to_sync(repo_root)

    print(f"\nπ Repository root: {repo_root}")
    print(f"π¦ Files to sync: {len(files_to_upload)}")
    print("\nπ File list:")

    # One line per file with its size, for a quick eyeball check.
    for file_path in files_to_upload:
        relative_path = file_path.relative_to(repo_root)
        file_size = file_path.stat().st_size
        print(f" {relative_path} ({file_size:,} bytes)")

    total_size = sum(f.stat().st_size for f in files_to_upload)
    print(f"\nπ Total size: {total_size:,} bytes ({total_size/1024/1024:.2f} MB)")
|
|
|
|
|
|
|
|
def sync_repository_to_hf(
    repo_id: str = "WCNegentropy/BitTransformerLM",
    token: Optional[str] = None,
    commit_message: str = "π Refined BitTransformerLM: Organized codebase with best practices",
    preview_only: bool = False
):
    """
    Sync the entire cleaned BitTransformerLM repository to HuggingFace Hub.

    Args:
        repo_id: HuggingFace repository ID
        token: HF token (defaults to HF_TOKEN environment variable)
        commit_message: Commit message for the upload
        preview_only: When True, stop after the security scan and only print
            the file list instead of uploading.

    Returns:
        True on success (or successful preview), False on any failure.
    """
    # Resolve the auth token; refuse to continue without one.
    if token is None:
        token = os.environ.get('HF_TOKEN')
    if not token:
        logger.error("HF_TOKEN environment variable not set and no token provided")
        return False

    try:
        login(token=token)
        api = HfApi()
        logger.info("Successfully authenticated with HuggingFace Hub")

        # scripts/<subdir>/<this file> -> repo root is three parents up.
        repo_root = Path(__file__).parent.parent.parent
        logger.info(f"Repository root: {repo_root}")

        files_to_upload = get_files_to_sync(repo_root)
        logger.info(f"Found {len(files_to_upload)} files to upload")

        # Hard gate: never upload anything that looks like a credential.
        logger.info("π Scanning files for secrets and tokens...")
        all_secrets = []
        for file_path in files_to_upload:
            secrets = scan_for_secrets(file_path)
            if secrets:
                relative_path = file_path.relative_to(repo_root)
                all_secrets.extend(f"{relative_path}: {secret}" for secret in secrets)

        if all_secrets:
            logger.error("π¨ SECURITY ALERT: Secrets detected in files!")
            logger.error("The following secrets were found and MUST be removed before sync:")
            for secret in all_secrets:
                logger.error(f"  - {secret}")
            logger.error("β SYNC ABORTED for security reasons!")
            logger.error("Please remove all secrets and use environment variables instead.")
            return False

        logger.info("✅ Security scan passed - no secrets detected")

        if preview_only:
            preview_sync(repo_root)
            return True

        logger.info("Syncing entire repository structure to HuggingFace...")

        try:
            import tempfile
            import shutil

            # Stage the whitelisted files in a temp dir so upload_folder sends
            # exactly this set, mirroring the repo layout.
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = Path(temp_dir)

                for file_path in files_to_upload:
                    relative_path = file_path.relative_to(repo_root)
                    dest_path = temp_path / relative_path
                    dest_path.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(file_path, dest_path)

                logger.info(f"Prepared {len(files_to_upload)} files for upload")

                # WARNING: delete_patterns=["*"] makes the remote repo an exact
                # mirror of the staged folder — anything not staged here is
                # deleted from the Hub repo in this commit.
                api.upload_folder(
                    folder_path=str(temp_path),
                    repo_id=repo_id,
                    repo_type="model",
                    commit_message=commit_message,
                    commit_description="""
BitTransformerLM refined with ML engineering best practices:

✅ **Organized Codebase Structure**
- Cleaned up 30+ scattered scripts into organized directories
- Standardized imports and docstring formatting
- Consolidated configuration management
- Professional package metadata

✅ **Enhanced Developer Experience**
- Comprehensive CLI interface with standardized arguments
- Type-safe configuration system with presets
- Improved error handling and logging
- Better modular organization

✅ **Production Quality**
- PyProject.toml with proper dependencies and tooling
- Consistent code formatting and documentation
- Maintainable directory structure
- Ready for serious development and research

The bit-native transformer architecture with reversible layers, safety telemetry,
and distributed training capabilities is now properly packaged for research use.
""".strip(),
                    delete_patterns=["*"]
                )

            uploaded_count = len(files_to_upload)

        except Exception as e:
            logger.error(f"Failed to upload folder: {e}")
            logger.info("Falling back to individual file upload...")

            # Fallback: one commit per file; a single failure skips that file
            # instead of aborting the whole sync.
            uploaded_count = 0
            for file_path in files_to_upload:
                # Computed outside the try so the warning below can never hit
                # an unbound local if relative_to() itself raises.
                relative_path = file_path.relative_to(repo_root)
                try:
                    logger.info(f"Uploading: {relative_path}")
                    api.upload_file(
                        path_or_fileobj=str(file_path),
                        path_in_repo=str(relative_path),
                        repo_id=repo_id,
                        repo_type="model",
                        commit_message=commit_message,
                    )
                    uploaded_count += 1
                    if uploaded_count % 10 == 0:
                        logger.info(f"Progress: {uploaded_count}/{len(files_to_upload)} files uploaded")
                except Exception as e:
                    logger.warning(f"Failed to upload {relative_path}: {e}")
                    continue

        logger.info(f"✅ Successfully uploaded {uploaded_count}/{len(files_to_upload)} files")
        logger.info(f"π Repository synced to: https://huggingface.co/{repo_id}")

        return True

    except Exception as e:
        logger.error(f"β Failed to sync repository: {e}")
        return False
|
|
|
|
|
def create_release_info():
    """Create a release information file for the OS launch.

    Writes RELEASE_INFO.md next to this script (UTF-8, so the non-ASCII
    glyphs survive on any platform) and returns its path.

    Returns:
        Path to the written RELEASE_INFO.md file.
    """
    release_info = """# BitTransformerLM v0.1.0 - Experimental Research Release

**Release Date:** August 2025
**Status:** Open Source Research Implementation
**License:** AGPLv3 + Commercial Licensing Available

## What's Included

This release provides a complete experimental framework for bit-native language modeling research:

- **Core Architecture:** 57 Python files implementing bit-native transformer with reversible layers
- **Safety Systems:** Real-time K/C/S telemetry and monitoring
- **Research Tools:** Interactive dashboard, distributed training, comprehensive testing
- **Documentation:** Professional model card, research status, and validation reports

## Important Notes

β οΈ **Experimental Status:** This is research code requiring rigorous baseline validation
β οΈ **Not Production Ready:** Needs extensive evaluation vs standard transformers
β οΈ **Research Use Only:** Intended for academic investigation and experimentation

## Licensing

- **Open Source:** AGPLv3 for research and open source use
- **Commercial:** Contact contact@wcnegentropy.com for commercial licensing

## Next Steps

The research community is invited to:
1. Conduct rigorous baseline comparisons vs standard transformers
2. Evaluate on established language modeling benchmarks
3. Validate (or refute) claimed memory efficiency benefits
4. Share findings openly to advance the field

**Research responsibly. Validate rigorously. Share openly.**
"""

    release_file = Path(__file__).parent / "RELEASE_INFO.md"
    # Explicit encoding: the original open() used the platform default, which
    # corrupts the non-ASCII characters above on some systems.
    release_file.write_text(release_info, encoding='utf-8')

    logger.info("Created RELEASE_INFO.md")
    return release_file
|
|
|
|
|
if __name__ == "__main__": |
|
|
import argparse |
|
|
|
|
|
parser = argparse.ArgumentParser(description="Sync BitTransformerLM to HuggingFace Hub") |
|
|
parser.add_argument("--preview", action="store_true", help="Preview files without uploading") |
|
|
parser.add_argument("--repo-id", default="WCNegentropy/BitTransformerLM", help="HuggingFace repo ID") |
|
|
parser.add_argument("--token", help="HuggingFace token (or set HF_TOKEN env var)") |
|
|
args = parser.parse_args() |
|
|
|
|
|
if args.preview: |
|
|
print("π Preview mode: showing files that would be synced...") |
|
|
preview_sync() |
|
|
print("\nβ
Use --token YOUR_TOKEN to perform actual sync") |
|
|
else: |
|
|
|
|
|
create_release_info() |
|
|
|
|
|
|
|
|
success = sync_repository_to_hf( |
|
|
repo_id=args.repo_id, |
|
|
token=args.token |
|
|
) |
|
|
|
|
|
if success: |
|
|
print(f"\nπ BitTransformerLM Sync Complete!") |
|
|
print(f"π Repository: https://huggingface.co/{args.repo_id}") |
|
|
print("\nRefined codebase with ML engineering best practices is now live! β¨") |
|
|
else: |
|
|
print("\nβ Sync failed. Please check logs and try again.") |