Upload folder using huggingface_hub
Files changed:
- README.md +2 -2
- package.json +0 -2
- pnpm-lock.yaml +0 -26
- src/routes/landingPageHtml.ts +3 -3
- src/routes/responses.ts +69 -67
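
In short: the proxy now talks to the Hugging Face router through the `openai` SDK instead of `@huggingface/inference` / `@huggingface/tasks`, the demo and landing-page snippets switch to router-style model IDs (optionally suffixed `:provider`), and the README roadmap marks streaming MCP support as done.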
README.md
CHANGED
@@ -112,7 +112,7 @@ Experience the API through our interactive web interface, adapted from the [open
 ```bash
 # Create demo/.env
 cat > demo/.env << EOF
-MODEL="
+MODEL="CohereLabs/c4ai-command-a-03-2025"
 OPENAI_BASE_URL=http://localhost:3000/v1
 OPENAI_API_KEY=${HF_TOKEN:-<your-huggingface-token>}
 EOF
@@ -181,7 +181,7 @@ responses.js/
 - [x] Multi-turn conversation fixes for text messages + tool calls
 - [x] Correctly return "usage" field
 - [x] MCP support (non-streaming)
-- [
+- [x] MCP support (streaming)
 - [ ] Tools execution (web search, file search, image generation, code interpreter)
 - [ ] Background mode support
 - [ ] Additional API routes (GET, DELETE, CANCEL, LIST responses)
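
For reference, because the heredoc delimiter is unquoted, `${HF_TOKEN:-<your-huggingface-token>}` is expanded when the file is written, so the resulting demo/.env looks like this (placeholder shown for an unset HF_TOKEN):

```bash
MODEL="CohereLabs/c4ai-command-a-03-2025"
OPENAI_BASE_URL=http://localhost:3000/v1
OPENAI_API_KEY=<your-huggingface-token>
```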
package.json
CHANGED
@@ -58,8 +58,6 @@
   "author": "Hugging Face",
   "license": "MIT",
   "dependencies": {
-    "@huggingface/inference": "^4.3.1",
-    "@huggingface/tasks": "^0.19.22",
     "@modelcontextprotocol/sdk": "^1.15.0",
     "express": "^4.21.2",
     "openai": "^5.8.2",
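
The pnpm-lock.yaml hunks below are the mechanical counterpart of this removal: `@huggingface/inference`, its transitive `@huggingface/jinja`, and `@huggingface/tasks` drop out of the importer, `packages`, and `snapshots` sections, while the already-present `openai` dependency now supplies both the runtime client and the chat-completion types used in src/routes/responses.ts.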
pnpm-lock.yaml
CHANGED
@@ -8,12 +8,6 @@ importers:
 
   .:
     dependencies:
-      '@huggingface/inference':
-        specifier: ^4.3.1
-        version: 4.3.1
-      '@huggingface/tasks':
-        specifier: ^0.19.22
-        version: 0.19.22
       '@modelcontextprotocol/sdk':
         specifier: ^1.15.0
        version: 1.15.0
@@ -258,17 +252,6 @@ packages:
     resolution: {integrity: sha512-1+WqvgNMhmlAambTvT3KPtCl/Ibr68VldY2XY40SL1CE0ZXiakFR/cbTspaF5HsnpDMvcYYoJHfl4980NBjGag==}
     engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0}
 
-  '@huggingface/inference@4.3.1':
-    resolution: {integrity: sha512-wn5ErcX+HTeAYfNIkgjl6pkzGvTeskKRoCFodSmEfa+SmZnMo0/YDP46Ivnz2JV6DJwMd3naOtgYH6WZVD3qoQ==}
-    engines: {node: '>=18'}
-
-  '@huggingface/jinja@0.5.0':
-    resolution: {integrity: sha512-Ptc03/jGRiYRoi0bUYKZ14MkDslsBRT24oxmsvUlfYrvQMldrxCevhPnT+hfX8awKTT8/f/0ZBBWldoeAcMHdQ==}
-    engines: {node: '>=18'}
-
-  '@huggingface/tasks@0.19.22':
-    resolution: {integrity: sha512-jtRXsJZTES01X4gJ5VOUnEm3ONyyfXUcWKObbWkr/SQmjaH/kxtWqc2zVWKaxL4QLoXqXJ+T+Pi5xupMStSudQ==}
-
   '@humanfs/core@0.19.1':
     resolution: {integrity: sha512-5DyQ4+1JEUzejeK1JGICcideyfUbGixgS9jNgex5nqkW+cY7WZhxBigmieN5Qnw9ZosSNVC9KQKyb+GUaGyKUA==}
     engines: {node: '>=18.18.0'}
@@ -1833,15 +1816,6 @@ snapshots:
       '@eslint/core': 0.15.1
       levn: 0.4.1
 
-  '@huggingface/inference@4.3.1':
-    dependencies:
-      '@huggingface/jinja': 0.5.0
-      '@huggingface/tasks': 0.19.22
-
-  '@huggingface/jinja@0.5.0': {}
-
-  '@huggingface/tasks@0.19.22': {}
-
   '@humanfs/core@0.19.1': {}
 
   '@humanfs/node@0.16.6':
src/routes/landingPageHtml.ts
CHANGED
@@ -621,7 +621,7 @@ tools = [
 ]
 
 response = client.responses.create(
-    model="
+    model="meta-llama/Llama-3.3-70B-Instruct:cerebras",
     tools=tools,
     input="What is the weather like in Boston today?",
     tool_choice="auto",
@@ -645,7 +645,7 @@ class CalendarEvent(BaseModel):
     participants: list[str]
 
 response = client.responses.parse(
-    model="
+    model="meta-llama/Meta-Llama-3-70B-Instruct:novita",
     input=[
         {"role": "system", "content": "Extract the event information."},
         {
@@ -668,7 +668,7 @@ client = OpenAI(
 )
 
 response = client.responses.create(
-    model="
+    model="meta-llama/Llama-3.3-70B-Instruct:cerebras",
     input="how does tiktoken work?",
     tools=[
        {
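
These model strings use the router's inline `model:provider` form, which replaces the `provider@model` parsing deleted from src/routes/responses.ts below. A minimal sketch of the two addressing styles (values are illustrative, not recovered from the truncated originals):

```ts
// Before: the proxy resolved the provider itself (logic removed below).
const legacy = "cerebras@meta-llama/Llama-3.3-70B-Instruct";
const model = legacy.includes("@") ? legacy.split("@")[1] : legacy; // model ID
const provider = legacy.includes("@") ? legacy.split("@")[0] : undefined; // provider hint

// After: the raw string, optionally suffixed with ":provider",
// is forwarded untouched and resolved by the Hugging Face router.
const routed = "meta-llama/Llama-3.3-70B-Instruct:cerebras";
```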
src/routes/responses.ts
CHANGED
@@ -2,13 +2,7 @@ import { type Response as ExpressResponse } from "express";
 import { type ValidatedRequest } from "../middleware/validation.js";
 import type { CreateResponseParams, McpServerParams, McpApprovalRequestParams } from "../schemas.js";
 import { generateUniqueId } from "../lib/generateUniqueId.js";
-import {
-import type {
-	ChatCompletionInputMessage,
-	ChatCompletionInputMessageChunkType,
-	ChatCompletionInput,
-} from "@huggingface/tasks";
-
+import { OpenAI } from "openai";
 import type {
 	Response,
 	ResponseStreamEvent,
@@ -18,9 +12,11 @@ import type {
 	ResponseOutputItem,
 } from "openai/resources/responses/responses";
 import type {
-
-
-
+	ChatCompletionCreateParamsStreaming,
+	ChatCompletionMessageParam,
+	ChatCompletionTool,
+} from "openai/resources/chat/completions.js";
+import type { FunctionParameters } from "openai/resources/shared.js";
 import { callMcpTool, connectMcpServer } from "../mcp.js";
 
 class StreamingError extends Error {
@@ -163,7 +159,7 @@ async function* innerRunStream(
 	}
 
 	// List MCP tools from server (if required) + prepare tools for the LLM
-	let tools:
+	let tools: ChatCompletionTool[] | undefined = [];
 	const mcpToolsMapping: Record<string, McpServerParams> = {};
 	if (req.body.tools) {
 		for (const tool of req.body.tools) {
@@ -213,7 +209,7 @@
 				type: "function" as const,
 				function: {
 					name: mcpTool.name,
-					parameters: mcpTool.input_schema,
+					parameters: mcpTool.input_schema as FunctionParameters,
 					description: mcpTool.description ?? undefined,
 				},
 			});
@@ -232,12 +228,8 @@
 
 	// Prepare payload for the LLM
 
-	// Resolve model and provider
-	const model = req.body.model.includes("@") ? req.body.model.split("@")[1] : req.body.model;
-	const provider = req.body.model.includes("@") ? req.body.model.split("@")[0] : undefined;
-
 	// Format input to Chat Completion format
-	const messages:
+	const messages: ChatCompletionMessageParam[] = req.body.instructions
 		? [{ role: "system", content: req.body.instructions }]
 		: [];
 	if (Array.isArray(req.body.input)) {
@@ -247,22 +239,20 @@
 			switch (item.type) {
 				case "function_call":
 					return {
-
-						role: "assistant",
-						name: `function_call ${item.name} ${item.call_id}`,
+						role: "tool" as const,
 						content: item.arguments,
+						tool_call_id: item.call_id,
 					};
 				case "function_call_output":
 					return {
-
-						role: "assistant",
-						name: `function_call_output ${item.call_id}`,
+						role: "tool" as const,
 						content: item.output,
+						tool_call_id: item.call_id,
 					};
 				case "message":
-
-
-					content
+				case undefined:
+					if (item.role === "assistant" || item.role === "user" || item.role === "system") {
+						const content =
 							typeof item.content === "string"
 								? item.content
 								: item.content
@@ -270,7 +260,7 @@
 							switch (content.type) {
 								case "input_image":
 									return {
-										type: "image_url" as
+										type: "image_url" as const,
 										image_url: {
 											url: content.image_url,
 										},
@@ -278,7 +268,7 @@
 								case "output_text":
 									return content.text
 										? {
-												type: "text" as
+												type: "text" as const,
 												text: content.text,
 											}
 										: undefined;
@@ -286,72 +276,80 @@
 									return undefined;
 								case "input_text":
 									return {
-										type: "text" as
+										type: "text" as const,
 										text: content.text,
 									};
 							}
 						})
-						.filter((item) =>
-
+						.filter((item) => {
+							return item !== undefined;
+						});
+					return {
+						role: item.role,
+						content,
+					} as ChatCompletionMessageParam;
+					}
+					return undefined;
 				case "mcp_list_tools": {
-					// Hacky: will be dropped by filter since tools are passed as separate objects
 					return {
-						role: "
-
-
+						role: "tool" as const,
+						content: `MCP list tools. Server: '${item.server_label}'.`,
+						tool_call_id: "mcp_list_tools",
 					};
 				}
 				case "mcp_call": {
 					return {
-						role: "
-						name: "mcp_call",
+						role: "tool" as const,
 						content: `MCP call (${item.id}). Server: '${item.server_label}'. Tool: '${item.name}'. Arguments: '${item.arguments}'.`,
+						tool_call_id: "mcp_call",
 					};
 				}
 				case "mcp_approval_request": {
 					return {
-						role: "
-						name: "mcp_approval_request",
+						role: "tool" as const,
 						content: `MCP approval request (${item.id}). Server: '${item.server_label}'. Tool: '${item.name}'. Arguments: '${item.arguments}'.`,
+						tool_call_id: "mcp_approval_request",
 					};
 				}
 				case "mcp_approval_response": {
 					return {
-						role: "
-						name: "mcp_approval_response",
+						role: "tool" as const,
 						content: `MCP approval response (${item.id}). Approved: ${item.approve}. Reason: ${item.reason}.`,
+						tool_call_id: "mcp_approval_response",
 					};
 				}
 			}
 		})
-		.filter(
+		.filter(
+			(message): message is NonNullable<typeof message> =>
+				message !== undefined &&
+				(typeof message.content === "string" || (Array.isArray(message.content) && message.content.length !== 0))
+		)
 		);
 	} else {
-		messages.push({ role: "user", content: req.body.input });
+		messages.push({ role: "user", content: req.body.input } as const);
 	}
 
 	// Prepare payload for the LLM
-	const payload:
+	const payload: ChatCompletionCreateParamsStreaming = {
 		// main params
-		model,
-		provider,
+		model: req.body.model,
 		messages,
-		stream:
+		stream: true,
 		// options
 		max_tokens: req.body.max_output_tokens === null ? undefined : req.body.max_output_tokens,
 		response_format: req.body.text?.format
-			?
-
-
-
-
-
-
-
-
-
-			}
+			? req.body.text.format.type === "json_schema"
+				? {
+						type: "json_schema",
+						json_schema: {
+							description: req.body.text.format.description,
+							name: req.body.text.format.name,
+							schema: req.body.text.format.schema,
+							strict: req.body.text.format.strict,
+						},
+					}
+				: { type: req.body.text.format.type }
 			: undefined,
 		temperature: req.body.temperature,
 		tool_choice:
@@ -475,11 +473,15 @@ async function* listMcpToolsStream(
  */
 async function* handleOneTurnStream(
 	apiKey: string | undefined,
-	payload:
+	payload: ChatCompletionCreateParamsStreaming,
 	responseObject: IncompleteResponse,
 	mcpToolsMapping: Record<string, McpServerParams>
 ): AsyncGenerator<ResponseStreamEvent> {
-	const
+	const client = new OpenAI({
+		baseURL: process.env.OPENAI_BASE_URL ?? "https://router.huggingface.co/v1",
+		apiKey: apiKey,
+	});
+	const stream = await client.chat.completions.create(payload);
 	let previousInputTokens = responseObject.usage?.input_tokens ?? 0;
 	let previousOutputTokens = responseObject.usage?.output_tokens ?? 0;
 	let previousTotalTokens = responseObject.usage?.total_tokens ?? 0;
@@ -565,7 +567,7 @@ async function* handleOneTurnStream(
 			}
 
 			let currentOutputItem = responseObject.output.at(-1);
-			if (delta.tool_calls[0].function
+			if (delta.tool_calls[0].function?.name) {
 				const functionName = delta.tool_calls[0].function.name;
 				// Tool call with a name => new tool call
 				let newOutputObject:
@@ -594,7 +596,7 @@
 				newOutputObject = {
 					type: "function_call",
 					id: generateUniqueId("fc"),
-					call_id: delta.tool_calls[0].id,
+					call_id: delta.tool_calls[0].id ?? "",
 					name: functionName,
 					arguments: "",
 				};
@@ -618,7 +620,7 @@
 				}
 			}
 
-			if (delta.tool_calls[0].function
+			if (delta.tool_calls[0].function?.arguments) {
 				// Current item is necessarily a tool call
 				currentOutputItem = responseObject.output.at(-1) as
 					| ResponseOutputItem.McpCall
@@ -737,7 +739,7 @@
 						arguments: lastOutputItem.arguments,
 						// Hacky: type is not correct in inference.js. Will fix it but in the meantime we need to cast it.
 						// TODO: fix it in the inference.js package. Should be "arguments" and not "parameters".
-					}
+					},
 				},
 			],
 		},
@@ -775,7 +777,7 @@ async function* callApprovedMCPToolStream(
 	approvalRequest: McpApprovalRequestParams | undefined,
 	mcpToolsMapping: Record<string, McpServerParams>,
 	responseObject: IncompleteResponse,
-	payload:
+	payload: ChatCompletionCreateParamsStreaming
 ): AsyncGenerator<ResponseStreamEvent> {
 	if (!approvalRequest) {
 		throw new Error(`MCP approval request '${approval_request_id}' not found`);
@@ -842,7 +844,7 @@
 						arguments: outputObject.arguments,
 						// Hacky: type is not correct in inference.js. Will fix it but in the meantime we need to cast it.
 						// TODO: fix it in the inference.js package. Should be "arguments" and not "parameters".
-					}
+					},
 				},
 			],
 		},
|