|
import os |
|
import gradio as gr |
|
import logging |
|
import json |
|
import hashlib |
|
from pathlib import Path |
|
from fontTools.ttLib import TTFont, TTLibError |
|
from huggingface_hub import HfApi |
|
|
|
# INFO level is needed for the "Uploaded ..." records emitted after each upload.
logging.basicConfig(level=logging.INFO)

# Single Hugging Face Hub client shared by every helper in this module.
API = HfApi()
# Hub access token taken from the ``TOKEN`` environment variable (``None`` when unset).
TOKEN = os.getenv("TOKEN")
# Dataset repository that receives the uploaded fonts and wordlists.
REPO_ID = "Felix92/docTR-resource-collection"
|
|
|
def get_supported_chars(font_path: Path) -> list[str]:
    """Return the printable characters that a font maps in its ``cmap`` table.

    The code points of every ``cmap`` subtable are merged, sorted, converted to
    characters and then filtered with ``str.isprintable``.

    Args:
        font_path: Location of the font file to inspect.

    Returns:
        The printable characters in ascending code-point order, or an empty
        list when the font cannot be read (the failure is logged, not raised).
    """
    try:
        # Use the font as a context manager so its underlying file is closed
        # as soon as the character map has been collected; a bare
        # ``TTFont(...)`` is never closed and leaks the handle on every call.
        with TTFont(font_path) as font:
            supported_chars = set()
            for table in font["cmap"].tables:
                supported_chars.update(table.cmap.keys())
            chars = [chr(code_point) for code_point in sorted(supported_chars)]
            return [char for char in chars if char.isprintable()]
    except TTLibError as e:
        logging.error(f"Error reading font file {font_path}: {e}")
        return []
    except Exception as e:
        # Best-effort by design: any other failure (missing file, missing
        # ``cmap`` table, ...) is reported to the caller as "no characters".
        logging.error(f"Unexpected error reading font file {font_path}: {e}")
        return []
|
|
|
def get_sha256(file_path: Path) -> str:
    """Compute the SHA-256 hex digest of a file's contents.

    The file is read in 8192-byte blocks, so it is never held in memory as a
    whole.

    Args:
        file_path: File to hash.

    Returns:
        The digest as a lowercase hexadecimal string.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        # An empty read marks end-of-file and ends the loop.
        while block := stream.read(8192):
            digest.update(block)
    return digest.hexdigest()
|
|
|
def file_exists_on_hub(file_name: str, subfolder: str) -> bool:
    """Tell whether the dataset repository already holds a matching file.

    A repository file matches when its path starts with
    ``"{subfolder}/{file_name}"``; this is a prefix comparison, not an exact
    path comparison.

    Args:
        file_name: Leading part of the file name to look for.
        subfolder: Repository folder the file is expected in.

    Returns:
        ``True`` if at least one repository path has that prefix.
    """
    wanted_prefix = f"{subfolder}/{file_name}"
    repo_files = API.list_repo_files(repo_id=REPO_ID, repo_type="dataset", token=TOKEN)
    for repo_file in repo_files:
        if repo_file.startswith(wanted_prefix):
            return True
    return False
|
|
|
def _upload_hub(file_path: str, subfolder: str, sha_hash: str) -> None:
    """Upload a local file to the dataset repository.

    The file is stored as ``"{subfolder}/{sha_hash}_{basename}"`` and the
    destination path is logged at INFO level once the upload call returns.

    Args:
        file_path: Local path of the file to upload.
        subfolder: Destination folder inside the repository.
        sha_hash: Hash string prepended to the file's base name.
    """
    base_name = Path(file_path).name
    repo_path = f"{subfolder}/{sha_hash}_{base_name}"
    API.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=repo_path,
        repo_id=REPO_ID,
        repo_type="dataset",
        token=TOKEN,
    )
    logging.info(f"Uploaded {repo_path}")
|
|
|
def _process_font(font_path: Path) -> str:
    """Upload one font together with its metadata JSON and return a status line."""
    font_sha = get_sha256(font_path)
    if file_exists_on_hub(font_sha, "fonts"):
        return f"⚠️ Font **{font_path.name}** was already uploaded."

    supported_chars = get_supported_chars(font_path)
    if not supported_chars:
        return f"⚠️ Font **{font_path.name}** has no supported characters."

    metadata = {
        "font_name": font_path.stem,
        "supported_characters": supported_chars,
    }
    json_path = font_path.with_suffix(".json")
    try:
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)

        json_sha = get_sha256(json_path)

        _upload_hub(str(font_path), "fonts", font_sha)
        _upload_hub(str(json_path), "fonts", json_sha)
    finally:
        # The JSON file only exists to be uploaded; remove it whether or not
        # the upload succeeded so it does not pile up next to the uploads.
        json_path.unlink(missing_ok=True)

    return f"✅ Font **{font_path.name}** uploaded successfully."


def _process_wordlist(wordlist_path: Path) -> str:
    """Upload one wordlist unless its hash is already on the hub; return a status line."""
    wordlist_sha = get_sha256(wordlist_path)
    if file_exists_on_hub(wordlist_sha, "wordlists"):
        return f"⚠️ Wordlist **{wordlist_path.name}** was already uploaded."

    _upload_hub(str(wordlist_path), "wordlists", wordlist_sha)
    return f"✅ Wordlist **{wordlist_path.name}** uploaded successfully."


def handle_uploads(font_upload, wordlist_upload, agree):
    """Upload the submitted fonts and wordlists and report one status line per file.

    Args:
        font_upload: Paths of the submitted font files, or ``None`` when nothing
            was selected.
        wordlist_upload: Paths of the submitted wordlist files, or ``None`` when
            nothing was selected.
        agree: Whether the terms were accepted; nothing is uploaded otherwise.

    Returns:
        A 4-tuple of values for the submit handler's outputs, in the order
        agree button, status message, font input, wordlist input. Any exception
        raised while processing is logged and reported in the status message
        instead of being propagated.
    """
    if not agree:
        # The notice must sit in the second slot (the status message); the
        # first slot is the agree button, which is left untouched.
        return (
            gr.update(),
            gr.Markdown("You must agree to the terms and conditions before proceeding."),
            gr.update(value=None),
            gr.update(value=None),
        )

    font_upload = font_upload or []
    wordlist_upload = wordlist_upload or []

    try:
        results = [_process_font(Path(font_file)) for font_file in font_upload]
        results.extend(_process_wordlist(Path(wordlist_file)) for wordlist_file in wordlist_upload)

        if not results:
            results.append("⚠️ No files uploaded.")

        result_md = "<br>".join(results)
        message = f"<div style='text-align: center;'>{result_md}</div>"
    except Exception as e:
        # UI boundary: log the traceback and show the error to the user rather
        # than letting the event handler crash.
        logging.exception("Upload failed")
        message = f"<div style='text-align: center;'><h3>An error occurred: {e}</h3></div>"

    # Both outcomes keep the agree button hidden and clear the two file inputs.
    return gr.update(visible=False), gr.Markdown(message), gr.update(value=None), gr.update(value=None)
|
|
|
|
|
# Two-step UI: the agreement text and its button are shown first; accepting the
# terms hides them and reveals the upload widgets.
with gr.Blocks(fill_height=True) as demo:
    # The literal's lines are flush-left so that no leading whitespace ends up
    # in the Markdown text.
    agreement_markdown = gr.Markdown(
        """
<div style="text-align: center;">
<h1>File Upload Agreement</h1>

<h3>This is a Hugging Face space for the docTR/OnnxTR community to collect wordlists and fonts for the following project/s:</h3>

<h3><a href="https://github.com/mindee/doctr">docTR</a></h3>

<h3><a href="https://github.com/felixdittrich92/OnnxTR">OnnxTR</a></h3>
</div>

<h3>The uploaded wordlists and fonts will be used to generate synthetic data.</h3>

<h3>All uploaded files can be found here: <a href="https://huggingface.co/datasets/Felix92/docTR-resource-collection">Hugging Face dataset</a></h3>

<br>
<br>

<h3>By uploading a wordlist or font, you explicitly agree to the following terms:</h3>

<h3>1. You affirm that you are the owner or have the necessary rights to upload and share the wordlist or font.</h3>

<h3>2. You agree that the uploaded wordlists / fonts will be made publicly available to everyone.</h3>

<h3>3. You agree that the uploaded wordlists / fonts can be used for any purpose, including commercial use, by any third party.</h3>
"""
    )
    agree_button = gr.Button("I Agree to the Terms and Conditions")
    # Remembers the acceptance so the submit handler can check it.
    agree_state = gr.State(value=False)

    # Upload widgets stay hidden until the terms have been accepted.
    with gr.Column(visible=False) as upload_section:
        success_message = gr.Markdown(visible=True)
        font_upload = gr.File(
            label="Upload Font File(s) [TTF | OTF]",
            type="filepath",
            file_count="multiple",
            file_types=[".ttf", ".otf"],
        )
        wordlist_upload = gr.File(
            label="Upload Wordlist(s) [TXT]",
            type="filepath",
            file_count="multiple",
            file_types=[".txt"],
        )
        submit_button = gr.Button("Submit")

    def toggle_agreement_visibility():
        """Hide the agreement text and button, record the consent, show the uploads.

        The returned values line up with the outputs of the agree button's
        click handler: agreement text, agree button, consent state, upload
        section.
        """
        hide_text = gr.update(visible=False)
        hide_button = gr.update(visible=False)
        show_uploads = gr.update(visible=True)
        return hide_text, hide_button, True, show_uploads

    agree_button.click(
        fn=toggle_agreement_visibility,
        inputs=None,
        outputs=[
            agreement_markdown,
            agree_button,
            agree_state,
            upload_section,
        ],
    )

    submit_button.click(
        fn=handle_uploads,
        inputs=[
            font_upload,
            wordlist_upload,
            agree_state,
        ],
        outputs=[
            agree_button,
            success_message,
            font_upload,
            wordlist_upload,
        ],
    )

# Start the app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
|