Spaces:

kurakurai
/

Luth-Demo

Running

File size: 2,382 Bytes

6b972fe
fd5c1a3
6b972fe
 
 
fd5c1a3
6b972fe
 
 
 
 
 
 
 
 
 
 
 
fd5c1a3
6b972fe
 
 
fd5c1a3
6b972fe
 
 
fd5c1a3
6b972fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd5c1a3
6b972fe
 
 
 
 
 
 
 
 
 
 
 
fd5c1a3
6b972fe

import spaces
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread

# No @spaces.GPU decorator here: this Space runs on CPU only.
_MODEL_ID = "kurakurai/Luth-0.6B-Instruct"

# Filled in by _load_model_and_tokenizer() on first use so the weights are
# loaded once per process instead of once per chat message.
_LOADED = {}


def _load_model_and_tokenizer():
    """Return the ``(tokenizer, model)`` pair, loading both on first call."""
    if not _LOADED:
        tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
        # No quantization kwargs are passed: the previous explicit
        # ``load_in_4bit=False`` only restated the default.
        # NOTE(review): float16 on CPU can be slow or unsupported for some ops
        # depending on the installed torch version -- consider float32 or
        # bfloat16 if generation misbehaves; kept as-is to preserve outputs.
        model = AutoModelForCausalLM.from_pretrained(
            _MODEL_ID,
            device_map="cpu",
            torch_dtype=torch.float16,
            trust_remote_code=True,
        )
        _LOADED["tokenizer"] = tokenizer
        _LOADED["model"] = model
    return _LOADED["tokenizer"], _LOADED["model"]


def build_chat_messages(message, history):
    """Convert chat history plus the new message into chat-template messages.

    Two history layouts are accepted:

    * pair-style: each entry is a sequence ``[user_text, assistant_text]``;
      even positions become ``"user"`` turns, odd positions ``"assistant"``.
    * message-style: each entry is a dict with ``"role"`` and ``"content"``
      keys; any other keys are ignored.

    Entries whose content is empty or ``None`` are skipped. ``history`` may
    itself be empty or ``None``.

    Args:
        message: The new user message; always appended last as a user turn.
        history: Previous turns in either layout described above.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts in conversation
        order.

    Raises:
        KeyError: If a message-style entry has content but no ``"role"`` key.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            content = turn.get("content")
            if content:
                messages.append({"role": turn["role"], "content": content})
            continue
        for position, text in enumerate(turn):
            if text:
                role = "user" if position % 2 == 0 else "assistant"
                messages.append({"role": role, "content": text})
    messages.append({"role": "user", "content": message})
    return messages


def predict(message, history):
    """Stream the model's reply to ``message`` given the prior ``history``.

    Generation runs in a background thread that feeds a
    ``TextIteratorStreamer``; this generator consumes the streamer and yields
    the reply accumulated so far after every new chunk.

    Args:
        message: The latest user message.
        history: Previous turns, in any layout accepted by
            ``build_chat_messages``.

    Yields:
        The partial reply text, growing with each streamed chunk.
    """
    tokenizer, model = _load_model_and_tokenizer()
    messages = build_chat_messages(message, history)

    # Render the conversation with the model's chat template and tokenize it.
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True,
    ).to("cpu")

    # The streamer is created with a 10 s timeout; presumably iteration fails
    # if no new text arrives within that window -- confirm against the
    # transformers documentation.
    streamer = TextIteratorStreamer(
        tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True
    )

    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.3,
        min_p=0.15,
        repetition_penalty=1.05,
        pad_token_id=tokenizer.eos_token_id,
    )

    # generate() blocks until completion, so run it in a worker thread and
    # consume the streamer from this generator.
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    partial_message = ""
    for new_text in streamer:
        partial_message += new_text
        yield partial_message

    # Reached only after the streamer is exhausted: reap the finished worker.
    worker.join()

# Chat UI: wire `predict` into a Gradio ChatInterface and start serving it.
_DESCRIPTION = """
    <center><h2>Kurakura AI Luth-0.6B-Instruct Chat</h2></center>
    
    Chat with [Luth-0.6B-Instruct](https://huggingface.co/kurakurai/Luth-0.6B-Instruct), a French-tuned version of Qwen3-0.6B.
    """

# Example prompts offered in the UI (French, matching the model's tuning).
_EXAMPLES = [
    "Peux-tu résoudre l'équation 3x - 7 = 11 pour x ?",
    "Explique la photosynthèse en termes simples.",
    "Écris un petit poème sur l'intelligence artificielle.",
]

demo = gr.ChatInterface(
    fn=predict,
    description=_DESCRIPTION,
    examples=_EXAMPLES,
    theme=gr.themes.Soft(primary_hue="purple"),
)
demo.launch()