Felix92's picture
update to multifile
cc2aa56
import os
import gradio as gr
import logging
import json
import hashlib
from pathlib import Path
from fontTools.ttLib import TTFont, TTLibError
from huggingface_hub import HfApi
# Emit INFO-level (and above) records through the root logger.
logging.basicConfig(level=logging.INFO)
# Shared Hugging Face Hub client used by the helper functions in this file.
API = HfApi()
# Hub access token read from the TOKEN environment variable (None when unset).
TOKEN = os.environ.get("TOKEN")
# Dataset repository that receives the uploaded fonts and wordlists.
REPO_ID = "Felix92/docTR-resource-collection"
def get_supported_chars(font_path: Path) -> list[str]:
    """Return the printable characters covered by a font's ``cmap`` tables.

    Args:
        font_path: Location of the font file to inspect.

    Returns:
        The printable characters mapped by any ``cmap`` subtable, ordered by
        code point. An empty list is returned (and the error is logged) when
        the font cannot be read or processed.
    """
    try:
        # Use the font as a context manager so its underlying file handle is
        # closed again; a bare ``TTFont(font_path)`` keeps it open.
        with TTFont(font_path) as font:
            code_points = set()
            for table in font["cmap"].tables:
                code_points.update(table.cmap.keys())
        chars = [chr(code_point) for code_point in sorted(code_points)]
        # Drop control characters and other non-printable code points.
        return [char for char in chars if char.isprintable()]
    except TTLibError as e:
        logging.error(f"Error reading font file {font_path}: {e}")
        return []
    except Exception as e:
        # Best-effort boundary: any other failure is reported as "no characters".
        logging.error(f"Unexpected error reading font file {font_path}: {e}")
        return []
def get_sha256(file_path: Path) -> str:
    """Compute the SHA-256 hex digest of a file's contents.

    Args:
        file_path: Path of the file to hash.

    Returns:
        The lowercase hexadecimal SHA-256 digest of the file's bytes.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        # Read in 8 KiB blocks so the whole file is never held in memory.
        while block := stream.read(8192):
            digest.update(block)
    return digest.hexdigest()
def file_exists_on_hub(file_name: str, subfolder: str) -> bool:
    """Check whether the dataset repo already holds a matching file.

    Args:
        file_name: Leading part of the file name to look for.
        subfolder: Repository folder the file would live in.

    Returns:
        True when any path in the repository starts with
        ``"{subfolder}/{file_name}"``, otherwise False.
    """
    repo_files = API.list_repo_files(
        repo_id=REPO_ID,
        repo_type="dataset",
        token=TOKEN,
    )
    prefix = f"{subfolder}/{file_name}"
    for repo_file in repo_files:
        if repo_file.startswith(prefix):
            return True
    return False
def _upload_hub(file_path: str, subfolder: str, sha_hash: str) -> None:
    """Upload a local file to the dataset repo under a hash-prefixed name.

    Args:
        file_path: Local path of the file to upload.
        subfolder: Repository folder to place the file in.
        sha_hash: Hash string prepended to the file name in the repository.
    """
    # The stored name is "<sha_hash>_<original file name>" inside the subfolder.
    repo_path = f"{subfolder}/{sha_hash}_{Path(file_path).name}"
    API.upload_file(
        repo_id=REPO_ID,
        repo_type="dataset",
        token=TOKEN,
        path_in_repo=repo_path,
        path_or_fileobj=file_path,
    )
    logging.info(f"Uploaded {repo_path}")
def handle_uploads(font_upload, wordlist_upload, agree):
    """Validate the submitted fonts and wordlists and push them to the Hub.

    Fonts are skipped when their hash already exists in the ``fonts`` folder
    or when no supported characters can be read from them; otherwise the font
    and a JSON metadata file are uploaded. Wordlists are skipped when their
    hash already exists in the ``wordlists`` folder.

    Args:
        font_upload: Paths of the uploaded font files, or ``None``/empty when
            nothing was selected.
        wordlist_upload: Paths of the uploaded wordlist files, or
            ``None``/empty when nothing was selected.
        agree: Whether the user accepted the terms and conditions.

    Returns:
        A 4-tuple of values for the outputs wired to the submit button, in
        this order: agree button, status markdown, font file input, wordlist
        file input.
    """
    if not agree:
        # The message belongs in the second slot (the status markdown). The
        # first slot targets the agree button, so it gets a no-op update
        # instead of the Markdown component.
        return (
            gr.update(),
            gr.Markdown("You must agree to the terms and conditions before proceeding."),
            None,
            None,
        )

    font_upload = font_upload or []
    wordlist_upload = wordlist_upload or []
    results = []

    try:
        # Handle fonts
        for font_file in font_upload:
            font_path = Path(font_file)
            font_sha = get_sha256(font_path)
            if file_exists_on_hub(font_sha, "fonts"):
                results.append(f"⚠️ Font **{font_path.name}** was already uploaded.")
                continue

            supported_chars = get_supported_chars(font_path)
            if not supported_chars:
                results.append(f"⚠️ Font **{font_path.name}** has no supported characters.")
                continue

            # Store the character coverage next to the font as a JSON sidecar.
            metadata = {
                "font_name": font_path.stem,
                "supported_characters": supported_chars,
            }
            json_path = font_path.with_suffix(".json")
            with open(json_path, "w", encoding="utf-8") as f:
                json.dump(metadata, f, ensure_ascii=False, indent=2)
            json_sha = get_sha256(json_path)

            _upload_hub(str(font_path), "fonts", font_sha)
            _upload_hub(str(json_path), "fonts", json_sha)
            results.append(f"✅ Font **{font_path.name}** uploaded successfully.")

        # Handle wordlists
        for wordlist_file in wordlist_upload:
            wordlist_path = Path(wordlist_file)
            wordlist_sha = get_sha256(wordlist_path)
            if file_exists_on_hub(wordlist_sha, "wordlists"):
                results.append(f"⚠️ Wordlist **{wordlist_path.name}** was already uploaded.")
                continue

            _upload_hub(str(wordlist_path), "wordlists", wordlist_sha)
            results.append(f"✅ Wordlist **{wordlist_path.name}** uploaded successfully.")

        if not results:
            results.append("⚠️ No files uploaded.")

        result_md = "<br>".join(results)
        # Hide the agree button, show the per-file report, and clear both inputs.
        return gr.update(visible=False), gr.Markdown(f"<div style='text-align: center;'>{result_md}</div>"), gr.update(value=None), gr.update(value=None)
    except Exception as e:
        # Top-level boundary for the UI callback: log the traceback and show
        # the error in the status area instead of crashing the event handler.
        logging.exception("Upload failed")
        return gr.update(visible=False), gr.Markdown(f"<div style='text-align: center;'><h3>An error occurred: {e}</h3></div>"), gr.update(value=None), gr.update(value=None)
with gr.Blocks(fill_height=True) as demo:
    # Terms shown first; hidden once the user presses the agree button.
    agreement_markdown = gr.Markdown(
        """
<div style="text-align: center;">
<h1>File Upload Agreement</h1>
<h3>This is a Hugging Face space for the docTR/OnnxTR community to collect wordlists and fonts for the following project/s:</h3>
<h3><a href="https://github.com/mindee/doctr">docTR</a></h3>
<h3><a href="https://github.com/felixdittrich92/OnnxTR">OnnxTR</a></h3>
</div>
<h3>The uploaded wordlists and fonts will be used to generate synthetic data.</h3>
<h3>All uploaded files can be found here: <a href="https://huggingface.co/datasets/Felix92/docTR-resource-collection">Hugging Face dataset</a></h3>
<br>
<br>
<h3>By uploading a wordlist or font, you explicitly agree to the following terms:</h3>
<h3>1. You affirm that you are the owner or have the necessary rights to upload and share the wordlist or font.</h3>
<h3>2. You agree that the uploaded wordlists / fonts will be made publicly available to everyone.</h3>
<h3>3. You agree that the uploaded wordlists / fonts can be used for any purpose, including commercial use, by any third party.</h3>
"""
    )
    agree_button = gr.Button("I Agree to the Terms and Conditions")
    # Tracks whether the terms were accepted; passed to the submit handler.
    agree_state = gr.State(value=False)

    # Upload form, initially hidden until the terms are accepted.
    with gr.Column(visible=False) as upload_section:
        success_message = gr.Markdown(visible=True)
        font_upload = gr.File(
            file_count="multiple",
            type="filepath",
            file_types=[".ttf", ".otf"],
            label="Upload Font File(s) [TTF | OTF]",
        )
        wordlist_upload = gr.File(
            file_count="multiple",
            type="filepath",
            file_types=[".txt"],
            label="Upload Wordlist(s) [TXT]",
        )
        submit_button = gr.Button("Submit")

    def toggle_agreement_visibility():
        """Hide the agreement text and button, record consent, show the form."""
        hidden_markdown = gr.update(visible=False)
        hidden_button = gr.update(visible=False)
        shown_upload_section = gr.update(visible=True)
        return hidden_markdown, hidden_button, True, shown_upload_section

    agree_button.click(
        fn=toggle_agreement_visibility,
        inputs=None,
        outputs=[agreement_markdown, agree_button, agree_state, upload_section],
    )
    submit_button.click(
        fn=handle_uploads,
        inputs=[font_upload, wordlist_upload, agree_state],
        outputs=[agree_button, success_message, font_upload, wordlist_upload],
    )

# Start the Gradio app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()