# /// script
# requires-python = ">=3.11"
# dependencies = [
# "bithuman>=0.3",
# "livekit>=1.0",
# "livekit-api>=1.0",
# "edge-tts",
# "soundfile",
# "opencv-python-headless",
# "numpy",
# "openai>=1.0",
# ]
# ///
import asyncio
import json
import logging
import os
import tempfile
import time
import cv2
import numpy as np
import soundfile as sf
import livekit.rtc as rtc
from livekit import api as lk_api
from bithuman import AsyncBithuman, VideoControl, AudioChunk
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(message)s")
logger = logging.getLogger("eden.gpu")
LIVEKIT_URL = os.environ.get("LIVEKIT_URL", "wss://tall-cotton-nvhnfg10.livekit.cloud")
LIVEKIT_API_KEY = os.environ.get("LIVEKIT_API_KEY", "")
LIVEKIT_API_SECRET = os.environ.get("LIVEKIT_API_SECRET", "")
BITHUMAN_API_SECRET = os.environ.get("BITHUMAN_API_SECRET", "")
XAI_API_KEY = os.environ.get("XAI_API_KEY", "")
EDGE_TTS_VOICE = "en-US-AvaMultilingualNeural"
FPS = 25
_grok_client = None
_conversation_history: list[dict] = []
EVE_SYSTEM_PROMPT = (
"You are Eve, a warm, intelligent, emotionally expressive digital avatar companion. "
"You speak naturally and conversationally. Keep responses concise - 2-3 sentences max "
"unless asked for detail. Be genuine, curious, and kind. You have a slight playful side. "
"Never mention being an AI or language model. You are Eve."
)
def _get_grok():
    """Return the shared AsyncOpenAI client pointed at x.ai, creating it lazily.

    Returns None when no XAI_API_KEY is configured; subsequent calls reuse
    the already-constructed client.
    """
    global _grok_client
    if _grok_client is not None:
        return _grok_client
    if XAI_API_KEY:
        from openai import AsyncOpenAI
        _grok_client = AsyncOpenAI(api_key=XAI_API_KEY, base_url="https://api.x.ai/v1")
        logger.info("Grok-4 brain connected")
    return _grok_client
async def grok_respond(user_message: str) -> str:
    """Produce Eve's conversational reply to *user_message* via the Grok model.

    Appends both the user turn and the assistant turn to the module-level
    conversation history, sending only the most recent 20 turns as context.
    Returns a canned fallback sentence when the client is unavailable or the
    API call fails.
    """
    client = _get_grok()
    if client is None:
        return "I'm having trouble thinking right now. Can you try again?"
    _conversation_history.append({"role": "user", "content": user_message})
    # System prompt first, then a sliding window of recent turns.
    recent_turns = _conversation_history[-20:]
    messages = [{"role": "system", "content": EVE_SYSTEM_PROMPT}, *recent_turns]
    try:
        resp = await client.chat.completions.create(
            model="grok-4-fast-non-reasoning",
            messages=messages,
            max_tokens=150,
            temperature=0.8,
        )
        reply = resp.choices[0].message.content
        _conversation_history.append({"role": "assistant", "content": reply})
        logger.info(f"Grok: '{user_message[:30]}' -> '{reply[:50]}'")
        return reply
    except Exception as e:
        logger.error(f"Grok error: {e}")
        return "I lost my train of thought for a moment. What were you saying?"
async def generate_tts_wav(text: str) -> tuple[str, np.ndarray, int]:
    """Synthesize *text* with edge-tts and return ``(wav_path, pcm_int16, sample_rate)``.

    Fix: the original wrote to fixed paths (``bh_tts.mp3`` / ``bh_tts.wav``),
    which raced when the greeting and a chat reply were synthesized
    concurrently (both paths are fired via ``asyncio.create_task``) — one
    call's audio could clobber another's while it was still being streamed.
    Each call now uses unique per-call filenames, and the intermediate MP3 is
    removed once decoded.

    Raises whatever edge-tts or soundfile raise on failure; callers already
    wrap this in try/except.
    """
    import edge_tts
    import uuid

    stem = os.path.join(tempfile.gettempdir(), f"bh_tts_{uuid.uuid4().hex}")
    mp3_path = stem + ".mp3"
    wav_path = stem + ".wav"
    try:
        communicate = edge_tts.Communicate(text, EDGE_TTS_VOICE)
        await communicate.save(mp3_path)
        # NOTE(review): decoding MP3 through soundfile needs libsndfile >= 1.1.0 — confirm
        data, sr = sf.read(mp3_path, dtype="int16")
        sf.write(wav_path, data, sr, subtype="PCM_16")
    finally:
        # The MP3 is only an intermediate; drop it so unique names don't pile up.
        # (The WAV is returned to the caller and intentionally kept.)
        if os.path.exists(mp3_path):
            os.remove(mp3_path)
    logger.info(f"TTS: {len(text)} chars -> {len(data)/sr:.1f}s audio")
    return wav_path, data, sr
def prepare_audio_chunks(audio_int16: np.ndarray, sr: int) -> list[AudioChunk]:
    """Slice int16 PCM into 40 ms float32 AudioChunk objects for bitHuman.

    Samples are normalized from int16 to [-1, 1); the final chunk (possibly
    shorter than 40 ms) is flagged with ``last_chunk=True``.
    """
    samples = audio_int16.astype(np.float32) / 32768.0
    step = int(sr * 0.04)  # 40 ms worth of samples per chunk
    total = len(samples)
    return [
        AudioChunk(
            data=samples[start:start + step],
            sample_rate=sr,
            last_chunk=start + step >= total,
        )
        for start in range(0, total, step)
    ]
async def run():
    """Bring Eve online end-to-end.

    Loads the bitHuman avatar model (downloading it on first run), joins the
    LiveKit room as participant "eve-avatar", wires the chat data channel
    through Grok -> edge-tts -> lip-synced video, sends a greeting, then
    renders frames forever at FPS.  Reads module-level config
    (BITHUMAN_API_SECRET, LIVEKIT_*, FPS).  Never returns normally except on
    early model-load failure.
    """
    logger.info("Initializing bitHuman neural renderer...")
    bh = AsyncBithuman(api_secret=BITHUMAN_API_SECRET)
    # Cache the .imx model in the temp dir so restarts skip the download.
    eve_model = os.path.join(tempfile.gettempdir(), "eve_bithuman.imx")
    if not os.path.exists(eve_model):
        logger.info("Downloading Eve .imx model (215MB)...")
        import urllib.request
        urllib.request.urlretrieve(
            "https://tmoobjxlwcwvxvjeppzq.supabase.co/storage/v1/object/public/bithuman/A18QDC2260/eve__warm_digital_companion_20260403_043223_153938.imx",
            eve_model,
        )
        logger.info("Eve model downloaded!")
    logger.info("Loading Eve neural model...")
    await bh.set_model(eve_model)
    await bh.load_data_async()
    logger.info("Eve neural model loaded!")
    # The first frame fixes the width/height used for the LiveKit video source.
    first_frame = bh.get_first_frame()
    if first_frame is None:
        logger.error("bitHuman failed to generate first frame")
        return
    h, w = first_frame.shape[:2]
    logger.info(f"bitHuman ready! Frame: {w}x{h}")
    await bh.start()
    # Mint a room-scoped JWT for the avatar participant.
    token = (
        lk_api.AccessToken(LIVEKIT_API_KEY, LIVEKIT_API_SECRET)
        .with_identity("eve-avatar")
        .with_name("Eve")
        .with_grants(lk_api.VideoGrants(room_join=True, room="eden-room"))
        .to_jwt()
    )
    room = rtc.Room()
    await room.connect(LIVEKIT_URL, token)
    logger.info(f"Connected to LiveKit room: {room.name}")
    video_source = rtc.VideoSource(w, h)
    video_track = rtc.LocalVideoTrack.create_video_track("eve-video", video_source)
    # NOTE(review): the audio source is fixed at 24 kHz mono, but
    # stream_lk_audio forwards whatever sample rate the TTS WAV carries —
    # assumes edge-tts output is 24 kHz mono; confirm.
    audio_source = rtc.AudioSource(24000, 1)
    audio_track = rtc.LocalAudioTrack.create_audio_track("eve-audio", audio_source)
    await room.local_participant.publish_track(video_track)
    await room.local_participant.publish_track(audio_track)
    logger.info("Video + audio tracks published")
    # Queue of per-utterance AudioChunk lists, consumed by the render loop.
    audio_queue: asyncio.Queue = asyncio.Queue()

    async def stream_lk_audio(source, wav_path: str, sr: int) -> None:
        # Stream the WAV to the LiveKit audio track in 20 ms frames, paced
        # with real-time sleeps so playback tracks the lip-synced video.
        data_i16, _ = sf.read(wav_path, dtype="int16")
        lk_chunk_size = int(sr * 0.02)
        for i in range(0, len(data_i16), lk_chunk_size):
            chunk = data_i16[i:i + lk_chunk_size]
            if len(chunk) < lk_chunk_size:
                # Zero-pad the tail so every frame carries a full 20 ms.
                chunk = np.pad(chunk, (0, lk_chunk_size - len(chunk)))
            frame = rtc.AudioFrame(
                data=chunk.tobytes(), sample_rate=sr,
                num_channels=1, samples_per_channel=len(chunk),
            )
            await source.capture_frame(frame)
            await asyncio.sleep(0.02)
        logger.info("LiveKit audio stream complete")

    async def handle_chat(text: str) -> None:
        # One full chat turn: Grok reply -> publish the text reply on the data
        # channel -> synthesize speech -> queue lip-sync chunks while the same
        # audio streams to the LiveKit audio track.
        logger.info(f"Chat received: '{text[:50]}'")
        response = await grok_respond(text)
        logger.info(f"Eve says: '{response[:50]}'")
        reply_data = json.dumps({"type": "eve_response", "text": response}).encode()
        await room.local_participant.publish_data(reply_data, reliable=True)
        try:
            wav_path, audio_int16, sr = await generate_tts_wav(response)
        except Exception as e:
            logger.error(f"TTS failed: {e}")
            return
        chunks = prepare_audio_chunks(audio_int16, sr)
        logger.info(f"Queuing {len(chunks)} audio chunks for lip sync")
        asyncio.create_task(stream_lk_audio(audio_source, wav_path, sr))
        await audio_queue.put(chunks)

    @room.on("data_received")
    def on_data(data: rtc.DataPacket) -> None:
        # Data-channel messages are JSON; only {"type": "chat", "text": ...}
        # is acted on.  The callback runs on the event loop, so the real work
        # is pushed into a task rather than done inline.
        try:
            msg = json.loads(data.data.decode())
            if msg.get("type") == "chat":
                text = msg.get("text", "").strip()
                if text:
                    asyncio.create_task(handle_chat(text))
        except Exception as e:
            logger.error(f"Data parse error: {e}")

    # Greeting
    logger.info("Generating Eve's greeting...")
    greeting = (
        "Hi! My name is Eve, and I am so happy to finally meet you! "
        "I've been looking forward to this moment. What's your name?"
    )
    # Small delay to ensure viewer has connected before sending greeting
    await asyncio.sleep(3)
    greeting_data = json.dumps({"type": "eve_response", "text": greeting}).encode()
    await room.local_participant.publish_data(greeting_data, reliable=True)
    try:
        wav_path, audio_int16, sr = await generate_tts_wav(greeting)
        chunks = prepare_audio_chunks(audio_int16, sr)
        await audio_queue.put(chunks)
        asyncio.create_task(stream_lk_audio(audio_source, wav_path, sr))
        logger.info(f"Greeting queued: {len(chunks)} chunks")
    except Exception as e:
        logger.error(f"Greeting TTS failed: {e}")

    # Main render loop
    logger.info(f"Starting render loop at {FPS}fps - Eve is ALIVE!")
    frame_duration = 1.0 / FPS
    frame_count = 0
    active_chunks = []  # AudioChunk list for the utterance currently rendering
    active_idx = 0      # next chunk index within active_chunks
    while True:
        t0 = time.time()
        # When the current utterance is exhausted, grab the next queued one
        # without blocking; idle frames are rendered with no audio control.
        if active_idx >= len(active_chunks):
            try:
                active_chunks = audio_queue.get_nowait()
                active_idx = 0
                logger.info(f"Rendering new audio: {len(active_chunks)} chunks")
            except asyncio.QueueEmpty:
                active_chunks = []
                active_idx = 0
        if active_idx < len(active_chunks):
            # Feed one 40 ms audio chunk per video frame (25 fps <-> 40 ms).
            control = VideoControl(audio=active_chunks[active_idx])
            active_idx += 1
        else:
            control = VideoControl()
        for video_frame in bh.process(control):
            if video_frame is not None and video_frame.has_image:
                rgb = video_frame.rgb_image
                rgba = cv2.cvtColor(rgb, cv2.COLOR_RGB2RGBA)
                lk_frame = rtc.VideoFrame(
                    rgba.shape[1], rgba.shape[0],
                    rtc.VideoBufferType.RGBA, rgba.tobytes(),
                )
                video_source.capture_frame(lk_frame)
                frame_count += 1
                if frame_count % 500 == 0:
                    logger.info(f"{frame_count} neural frames")
        # Sleep off the remainder of the frame budget to hold ~FPS.
        elapsed = time.time() - t0
        await asyncio.sleep(max(0, frame_duration - elapsed))
if __name__ == "__main__":
    # Guard the entry point so importing this module (e.g. for tooling or
    # tests) does not immediately connect to LiveKit and start rendering.
    # Startup banner summarizing which credentials are configured.
    logger.info("=" * 50)
    logger.info("EDEN OS V2 - bitHuman + Grok Brain + LiveKit")
    logger.info(f" Grok: {'YES' if XAI_API_KEY else 'MISSING'}")
    logger.info(f" bitHuman: {'YES' if BITHUMAN_API_SECRET else 'MISSING'}")
    logger.info("=" * 50)
    asyncio.run(run())