eienmojiki committed
Commit ca8bea5 · verified · 1 Parent(s): fb14067

Upload 7 files

Add FastAPI chat completion service and configuration files


- Implement chat completion API
- Add Dockerfile for containerization
- Include requirements for dependencies
- Configure VSCode settings
- Create .gitignore for project files

.gitignore ADDED
@@ -0,0 +1,2 @@
+ test.py
+ .venv
.vscode/settings.json ADDED
@@ -0,0 +1,3 @@
+ {
+     "editor.wordWrap": "on"
+ }
Dockerfile ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.11
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ COPY --chown=user . /app
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,9 @@
+ from fastapi import FastAPI
+ from routes.chatCompletion import router
+
+ app = FastAPI()
+ app.include_router(router)  # register the /v1/chat-completion route
+
+ @app.get("/")
+ async def root():
+     return {"message": "Hello World"}
models/chat_completion.py ADDED
@@ -0,0 +1,30 @@
+ from pydantic import BaseModel, Field
+ from typing import List, Optional
+ from huggingface_hub import (
+     ChatCompletionInputMessage,
+     ChatCompletionInputGrammarType,
+     ChatCompletionInputStreamOptions,
+     ChatCompletionInputToolChoiceClass,
+     ChatCompletionInputTool,
+ )
+
+ class ChatRequest(BaseModel):
+     model: str = Field(..., description="The model to use for chat-completion. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed Inference Endpoint. If not provided, the default recommended model for chat-based text-generation will be used. See https://huggingface.co/tasks/text-generation for more details.")
+     messages: List[ChatCompletionInputMessage] = Field(..., description="Conversation history consisting of roles and content pairs.")
+     frequency_penalty: Optional[float] = Field(0.0, ge=-2.0, le=2.0, description="Penalizes new tokens based on their existing frequency in the text so far. Range: [-2.0, 2.0]. Defaults to 0.0.")
+     logit_bias: Optional[dict] = Field(None, description="Modify the likelihood of specified tokens appearing in the completion. Accepts a JSON object that maps tokens to an associated bias value from -100 to 100.")
+     logprobs: Optional[bool] = Field(None, description="Whether to return log probabilities of the output tokens or not.")
+     max_tokens: Optional[int] = Field(100, description="Maximum number of tokens allowed in the response. Defaults to 100.")
+     n: Optional[int] = Field(None, description="UNUSED.")
+     presence_penalty: Optional[float] = Field(None, ge=-2.0, le=2.0, description="Positive values penalize new tokens based on whether they appear in the text so far.")
+     response_format: Optional[ChatCompletionInputGrammarType] = Field(None, description="Grammar constraints. Can be either a JSONSchema or a regex.")
+     seed: Optional[int] = Field(None, description="Seed for reproducible control flow.")
+     stop: Optional[List[str]] = Field(None, description="Up to four strings which trigger the end of the response.")
+     stream: Optional[bool] = Field(False, description="Enable realtime streaming of responses. Defaults to False.")
+     stream_options: Optional[ChatCompletionInputStreamOptions] = Field(None, description="Options for streaming completions.")
+     temperature: Optional[float] = Field(1.0, ge=0.0, le=2.0, description="Controls randomness of the generations. Lower values ensure less random completions.")
+     top_logprobs: Optional[int] = Field(None, ge=0, le=5, description="Number of most likely tokens to return at each token position.")
+     top_p: Optional[float] = Field(1.0, ge=0.0, le=1.0, description="Fraction of the most likely next words to sample from.")
+     tool_choice: Optional[ChatCompletionInputToolChoiceClass] = Field("auto", description="The tool to use for the completion. Defaults to 'auto'.")
+     tool_prompt: Optional[str] = Field(None, description="A prompt to be appended before the tools.")
+     tools: Optional[List[ChatCompletionInputTool]] = Field(None, description="A list of tools the model may call.")
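
For illustration, a minimal sketch of how this model validates a request body. The model ID and values are examples only, and it assumes pydantic can coerce plain dicts into the huggingface_hub message dataclass:

    from models.chat_completion import ChatRequest

    payload = {
        "model": "meta-llama/Meta-Llama-3-8B-Instruct",  # example model ID
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 64,
    }
    req = ChatRequest(**payload)
    print(req.temperature)  # 1.0 — the declared default

    # Out-of-range values are rejected by the Field constraints (le=2.0):
    # ChatRequest(**{**payload, "temperature": 5.0})  -> ValidationError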
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ fastapi
+ uvicorn[standard]
+ huggingface_hub==0.27.1
routes/chatCompletion.py ADDED
@@ -0,0 +1,43 @@
+ from fastapi import APIRouter
+ from fastapi.responses import StreamingResponse
+ from models.chat_completion import ChatRequest
+ from huggingface_hub import InferenceClient
+ from dataclasses import asdict
+ import json
+
+ router = APIRouter()
+
+ # Serialize each streamed chunk as a Server-Sent Events "data:" frame
+ def generate_stream(response):
+     for chunk in response:
+         yield f"data: {json.dumps(asdict(chunk), separators=(',', ':'))}\n\n"
+
+ @router.post("/v1/chat-completion", tags=["Chat Completion"])
+ async def chat_completion(body: ChatRequest):
+     client = InferenceClient(model=body.model)
+
+     res = client.chat_completion(
+         messages=body.messages,
+         frequency_penalty=body.frequency_penalty,
+         logit_bias=body.logit_bias,
+         logprobs=body.logprobs,
+         max_tokens=body.max_tokens,
+         n=body.n,
+         presence_penalty=body.presence_penalty,
+         response_format=body.response_format,
+         seed=body.seed,
+         stop=body.stop,
+         stream=body.stream,
+         stream_options=body.stream_options,
+         temperature=body.temperature,
+         top_logprobs=body.top_logprobs,
+         top_p=body.top_p,
+         tool_choice=body.tool_choice,
+         tool_prompt=body.tool_prompt,
+         tools=body.tools
+     )
+
+     if not body.stream:
+         # Return a dict so FastAPI emits JSON; json.dumps here would double-encode
+         return asdict(res)
+     else:
+         return StreamingResponse(generate_stream(res), media_type="text/event-stream")
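
To round out the picture, a hedged end-to-end call against a locally running instance. It assumes the container above is up on port 7860 and that requests is installed on the client side; the model ID is an example:

    import requests

    resp = requests.post(
        "http://localhost:7860/v1/chat-completion",
        json={
            "model": "meta-llama/Meta-Llama-3-8B-Instruct",  # example model ID
            "messages": [{"role": "user", "content": "Say hi in one sentence."}],
            "max_tokens": 32,
        },
        timeout=60,
    )
    resp.raise_for_status()
    print(resp.json())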