import gradio as gr
import os
import subprocess
from huggingface_hub import snapshot_download, login
import tempfile
import shutil


# Download a Hugging Face model, convert it to GGUF, and quantize it with llama.cpp
def convert_to_gguf(model_id, quantization_method, hf_token=None):
    temp_dir = None
    llama_cpp_dir = "/tmp/llama.cpp"
    try:
        # Create temporary directories for the downloaded model and the GGUF output
        temp_dir = tempfile.mkdtemp()
        output_dir = tempfile.mkdtemp()

        # Log in to Hugging Face if a token is provided (needed for gated models)
        if hf_token:
            login(hf_token)

        # Download the model snapshot from Hugging Face
        model_path = snapshot_download(repo_id=model_id, local_dir=temp_dir, local_dir_use_symlinks=False)

        # Clone the llama.cpp repository if it is not already present
        if not os.path.exists(llama_cpp_dir):
            subprocess.run(
                ["git", "clone", "https://github.com/ggerganov/llama.cpp.git", llama_cpp_dir],
                check=True,
            )

        # Install the Python dependencies for the llama.cpp conversion script
        subprocess.run(["pip", "install", "-r", f"{llama_cpp_dir}/requirements.txt"], check=True)

        # Configure and build llama.cpp with CMake (produces the quantize binary)
        build_dir = os.path.join(llama_cpp_dir, "build")
        os.makedirs(build_dir, exist_ok=True)
        subprocess.run(["cmake", ".."], cwd=build_dir, check=True)
        subprocess.run(["cmake", "--build", ".", "--config", "Release"], cwd=build_dir, check=True)

        # Convert the model to GGUF at f16 precision
        intermediate_gguf = os.path.join(output_dir, f"{model_id.replace('/', '-')}-f16.gguf")
        convert_cmd = [
            "python",
            f"{llama_cpp_dir}/convert_hf_to_gguf.py",
            model_path,
            "--outfile", intermediate_gguf,
            "--outtype", "f16",
        ]
        subprocess.run(convert_cmd, check=True)

        # Quantize the f16 GGUF file to the selected method
        # (recent llama.cpp builds name the binary llama-quantize; older ones used quantize)
        quantize_bin = os.path.join(build_dir, "bin", "llama-quantize")
        if not os.path.exists(quantize_bin):
            quantize_bin = os.path.join(build_dir, "bin", "quantize")
        output_gguf = os.path.join(output_dir, f"{model_id.replace('/', '-')}-{quantization_method}.gguf")
        quantize_cmd = [quantize_bin, intermediate_gguf, output_gguf, quantization_method]
        subprocess.run(quantize_cmd, check=True)

        # Return the path to the quantized GGUF file for the Gradio File component
        return output_gguf, f"Model converted and quantized successfully! Download the GGUF file: {output_gguf}"
    except Exception as e:
        return None, f"Error: {str(e)}"
    finally:
        # Clean up the downloaded model and the llama.cpp checkout
        # (output_dir is kept so the returned GGUF file remains downloadable)
        if temp_dir and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
        if os.path.exists(llama_cpp_dir):
            shutil.rmtree(llama_cpp_dir)


# Gradio interface
def gradio_app():
    with gr.Blocks() as demo:
        gr.Markdown("# Hugging Face to GGUF Converter")
        gr.Markdown("Enter a Hugging Face model ID and select a quantization method to convert the model to GGUF format.")
        with gr.Row():
            model_id_input = gr.Textbox(label="Hugging Face Model ID", placeholder="e.g., meta-llama/Llama-3.2-3B-Instruct")
            hf_token_input = gr.Textbox(label="Hugging Face Token (optional, for gated models)", type="password", placeholder="Enter your HF token")
        with gr.Row():
            quantization_method = gr.Dropdown(
                choices=["q8_0", "q4_k_m", "q4_0", "q5_0", "q5_k_m", "q6_k", "q3_k_m"],
                label="Quantization Method",
                value="q4_k_m",
            )
        convert_button = gr.Button("Convert to GGUF")
        output_file = gr.File(label="Download GGUF File")
        output_message = gr.Textbox(label="Status")
        convert_button.click(
            fn=convert_to_gguf,
            inputs=[model_id_input, quantization_method, hf_token_input],
            outputs=[output_file, output_message],
        )
    return demo


if __name__ == "__main__":
    gradio_app().launch()
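
# Usage notes (a minimal sketch; file name and version choices are assumptions):
# The app expects git, cmake, and a C++ toolchain on PATH so llama.cpp can be
# cloned and built at runtime, plus the Gradio and huggingface_hub packages:
#
#   pip install gradio huggingface_hub
#
# Assuming this script is saved as app.py, launch it with `python app.py` and
# open the local URL that Gradio prints (typically http://127.0.0.1:7860).
# Large models can take a long time to download, convert, and quantize, so a
# machine with ample disk space and RAM is recommended.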