Update app.py
app.py CHANGED

@@ -41,36 +41,35 @@ class ZephyrLLM(LLM):
         self.temperature = temperature
 
     def _call(self, prompt, stop=None):
+        # Format as chat message
+        messages = [{"role": "user", "content": prompt}]
+
+        # Apply Zephyr's chat template
+        formatted_prompt = self.tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        # Send request to Hugging Face Inference API
+        payload = {
+            "inputs": formatted_prompt,
+            "parameters": {
+                "max_new_tokens": self.max_new_tokens,
+                "temperature": self.temperature
+            }
+        }
+        response = requests.post(self.api_url, headers=self.headers, json=payload)
+
+        if response.status_code == 200:
+            full_response = response.json()[0]["generated_text"]
+
+            # Extract the assistant reply from the full response
+            # After <|assistant|>\n, everything is the model's answer
+            if "<|assistant|>" in full_response:
+                return full_response.split("<|assistant|>")[-1].strip()
+            else:
+                return full_response.strip()
+
         else:
+            raise Exception(f"Failed call [{response.status_code}]: {response.text}")
 
 
     @property
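For completeness, a hypothetical way to drive the updated class from LangChain; the constructor parameters shown here are guesses, since the commit only shows the tail of __init__ and the rewritten _call body:

import os

# Hypothetical usage; api_token, max_new_tokens and temperature are assumed
# constructor parameters, inferred from the attributes that _call reads.
llm = ZephyrLLM(
    api_token=os.environ["HF_TOKEN"],
    max_new_tokens=256,
    temperature=0.7,
)
print(llm.invoke("Explain LangChain in one sentence."))

On older LangChain releases the custom LLM is called directly, llm("..."), instead of through invoke; both routes end up in the _call method changed by this commit.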