Spaces:

SonicaB
/

Scene-Mood-Classifier

Sleeping

Scene-Mood-Classifier / fusion-app /utils_media.py

Upload folder using huggingface_hub

5ce022b verified 5 days ago

1.46 kB

	from pathlib import Path
	from typing import Tuple, Union
	import io
	import numpy as np
	from PIL import Image
	import ffmpeg
	from pydub import AudioSegment

	# helpers
	def _to_path(p: Union[str, dict, Path, None]) -> str:
	    """Normalize a path-like input to a plain string path.

	    Accepts a string, a ``pathlib.Path``, or a dict payload that carries the
	    location under ``"name"``, ``"path"`` or ``"data"`` (checked in that
	    order; the first truthy value wins and is returned as-is).

	    Returns:
	        The path string, or ``""`` when the input is ``None`` or a dict with
	        none of the expected keys set.
	    """
	    # None would otherwise stringify to the literal "None", which slips past
	    # callers' empty-path checks and is then passed on as a file name.
	    if p is None:
	        return ""
	    if isinstance(p, dict):
	        return p.get("name") or p.get("path") or p.get("data") or ""
	    return str(p)

	def _audiosegment_float32(seg: AudioSegment) -> np.ndarray:
	    """Convert a pydub segment into normalized float32 samples.

	    The segment is set to a 16000 Hz frame rate, one channel and a 2-byte
	    sample width; the resulting int16 samples are then divided by 32768 so
	    the values fall in [-1.0, 1.0).
	    """
	    resampled = seg.set_frame_rate(16000)
	    mono = resampled.set_channels(1)
	    pcm16 = mono.set_sample_width(2)  # 16-bit
	    raw = np.array(pcm16.get_array_of_samples(), dtype=np.int16)
	    as_float = raw.astype(np.float32)
	    return as_float / 32768.0

	# public API
	def video_to_frame_audio(video_in) -> Tuple[Image.Image, np.ndarray]:
	    """Extract one still frame and the audio samples from a video.

	    Args:
	        video_in: Anything ``_to_path`` accepts (a string, a ``Path`` or a
	            dict payload holding the file location).

	    Returns:
	        A ``(frame, audio)`` tuple: the single frame ffmpeg emits, as an RGB
	        PIL image, and the audio as returned by ``_audiosegment_float32``.

	    Raises:
	        ValueError: If the resolved video path is empty.
	        RuntimeError: If ffmpeg fails while extracting the frame.
	    """
	    video_path = _to_path(video_in)
	    if not video_path:
	        raise ValueError("Empty video path")
	    try:
	        # Ask ffmpeg for exactly one MJPEG-encoded frame on stdout.
	        out, _ = (
	            ffmpeg
	            .input(video_path)
	            .output('pipe:', vframes=1, format='image2', vcodec='mjpeg')
	            .run(capture_stdout=True, capture_stderr=True)
	        )
	    except ffmpeg.Error as e:
	        # ffmpeg prints its banner first and the actual failure last, so keep
	        # the tail of stderr. Decode leniently so undecodable bytes (or a
	        # missing stderr) cannot mask the original error.
	        detail = (e.stderr or b"").decode(errors="replace")[-2000:]
	        raise RuntimeError(f"ffmpeg frame extract failed: {detail}") from e
	    # Decoding the image cannot raise ffmpeg.Error, so it stays outside the try.
	    frame = Image.open(io.BytesIO(out)).convert("RGB")
	    seg = AudioSegment.from_file(video_path)
	    audio16k = _audiosegment_float32(seg)
	    return frame, audio16k

	def load_audio_16k(audio_path_like) -> np.ndarray:
	    """Load an audio file and return it via ``_audiosegment_float32``.

	    Args:
	        audio_path_like: Anything ``_to_path`` accepts (a string, a ``Path``
	            or a dict payload holding the file location).

	    Returns:
	        The samples produced by ``_audiosegment_float32`` for the decoded file.

	    Raises:
	        ValueError: If the resolved audio path is empty.
	    """
	    path = _to_path(audio_path_like)
	    # Mirror video_to_frame_audio: reject an empty path up front instead of
	    # letting the decoder fail on it with a less specific error.
	    if not path:
	        raise ValueError("Empty audio path")
	    seg = AudioSegment.from_file(path)
	    return _audiosegment_float32(seg)