baoyin2024 committed (verified)
Commit 3215c20 · Parent: 0666a2d

Update app.py

Files changed (1)
  1. app.py +62 -41
app.py CHANGED
@@ -10,14 +10,15 @@ from werkzeug.utils import secure_filename
  import tempfile
  from moviepy.editor import VideoFileClip
  import logging
- import torchaudio # Import torchaudio
+ import torchaudio
+ import ffmpeg  # ffmpeg-python

- # Configure logging
+ # Configure logging
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

  app = Flask(__name__)

- # Configuration
+ # Configuration
  MAX_CONCURRENT_REQUESTS = 2
  MAX_FILE_DURATION = 60 * 30
  TEMPORARY_FOLDER = tempfile.gettempdir()
@@ -25,15 +26,15 @@ ALLOWED_AUDIO_EXTENSIONS = {'mp3', 'wav', 'ogg', 'm4a', 'flac', 'aac', 'wma', 'o
  ALLOWED_VIDEO_EXTENSIONS = {'mp4', 'avi', 'mov', 'mkv', 'webm', 'flv', 'wmv', 'mpeg', 'mpg', '3gp'}
  ALLOWED_EXTENSIONS = ALLOWED_AUDIO_EXTENSIONS.union(ALLOWED_VIDEO_EXTENSIONS)

- API_KEY = os.environ.get("API_KEY") # Load API key from environment
- MODEL_NAME = os.environ.get("WHISPER_MODEL", "guillaumekln/faster-whisper-large-v2") # Configurable model
+ API_KEY = os.environ.get("API_KEY")
+ MODEL_NAME = os.environ.get("WHISPER_MODEL", "guillaumekln/faster-whisper-large-v2")

- # Device check for faster-whisper
+ # Device check
  device = "cuda" if torch.cuda.is_available() else "cpu"
  compute_type = "float16" if device == "cuda" else "int8"
- logging.info(f"Using device: {device} with compute_type: {compute_type}")
+ logging.info(f"Using device: {device}, compute_type: {compute_type}")

- # Faster Whisper setup
+ # Faster Whisper setup
  beamsize = 2
  try:
      wmodel = WhisperModel(
@@ -42,12 +43,12 @@ try:
          compute_type=compute_type,
          download_root="./model_cache"
      )
-     logging.info(f"Model {MODEL_NAME} loaded successfully.")
+     logging.info(f"Model {MODEL_NAME} loaded successfully.")
  except Exception as e:
-     logging.error(f"Failed to load model {MODEL_NAME}: {e}")
+     logging.error(f"Failed to load model {MODEL_NAME}: {e}")
      wmodel = None

- # Concurrency control
+ # Concurrency control
  request_semaphore = Semaphore(MAX_CONCURRENT_REQUESTS)
  active_requests = 0

@@ -70,29 +71,35 @@ def cleanup_temp_files(*file_paths):
          try:
              if file_path and os.path.exists(file_path):
                  os.remove(file_path)
-                 logging.info(f"Deleted temporary file: {file_path}")
+                 logging.info(f"Deleted temporary file: {file_path}")
          except Exception as e:
-             logging.error(f"Error cleaning up temp file {file_path}: {str(e)}")
+             logging.error(f"Error deleting temporary file {file_path}: {str(e)}")


  def extract_audio_from_video(video_path, output_audio_path):
      try:
-         video = VideoFileClip(video_path)
+         # Call FFmpeg via ffmpeg-python
+         ffmpeg.input(video_path).output(output_audio_path, acodec='pcm_s16le').run(capture_stdout=True, capture_stderr=True)
+         # Or, with more options:
+         # ffmpeg.input(video_path).output(output_audio_path, acodec='pcm_s16le', ar=44100, ac=2).run(capture_stdout=True, capture_stderr=True)
+
+         # Check the video duration
+         video = VideoFileClip(video_path)  # moviepy
          if video.duration > MAX_FILE_DURATION:
              video.close()
-             raise ValueError(f"Video duration exceeds {MAX_FILE_DURATION} seconds")
-         video.audio.write_audiofile(output_audio_path, codec='pcm_s16le') # Specify codec
+             raise ValueError(f"Video duration exceeds {MAX_FILE_DURATION} seconds")
          video.close()
+
          return output_audio_path
      except Exception as e:
-         logging.exception("Error extracting audio from video")
-         raise Exception(f"Failed to extract audio from video: {str(e)}")
+         logging.exception("Error extracting audio from video")
+         raise Exception(f"Failed to extract audio from video: {str(e)}")


  @app.route("/health", methods=["GET"])
  def health_check():
      return jsonify({
-         'status': 'API is running',
+         'status': 'API is running',
          'timestamp': datetime.datetime.now().isoformat(),
          'device': device,
          'compute_type': compute_type,
@@ -118,10 +125,10 @@ def transcribe():
      global active_requests

      if not validate_api_key(request):
-         return jsonify({'error': 'Invalid API key'}), 401
+         return jsonify({'error': 'Invalid API key'}), 401

      if not request_semaphore.acquire(blocking=False):
-         return jsonify({'error': 'Server busy'}), 503
+         return jsonify({'error': 'Server busy'}), 503

      active_requests += 1
      start_time = time.time()
@@ -130,20 +137,20 @@ def transcribe():

      try:
          if wmodel is None:
-             return jsonify({'error': 'Model failed to load. Check server logs.'}), 500
+             return jsonify({'error': 'Model failed to load. Check the server logs.'}), 500

          if 'file' not in request.files:
-             return jsonify({'error': 'No file provided'}), 400
+             return jsonify({'error': 'No file provided'}), 400

          file = request.files['file']
          if not (file and allowed_file(file.filename)):
-             return jsonify({'error': f'Invalid file format. Supported: {", ".join(ALLOWED_EXTENSIONS)}'}), 400
+             return jsonify({'error': f'Invalid file format. Supported: {", ".join(ALLOWED_EXTENSIONS)}'}), 400

-         # Save uploaded file to temporary location
+         # Save the uploaded file to a temporary location
          temp_file_path = os.path.join(TEMPORARY_FOLDER, secure_filename(file.filename))
          file.save(temp_file_path)

-         # Check if file is a video and extract audio if necessary
+         # If the file is a video, extract the audio
          file_extension = file.filename.rsplit('.', 1)[1].lower()
          is_video = file_extension in ALLOWED_VIDEO_EXTENSIONS

@@ -153,19 +160,32 @@ def transcribe():
              transcription_file = temp_audio_path
          else:
              transcription_file = temp_file_path
-         # Check audio file duration directly
+
+         # Check the audio file duration
          try:
-             info = torchaudio.info(transcription_file)
-             duration = info.num_frames / info.sample_rate
+             # Load the audio with torchaudio.load, specifying the format
+             waveform, sample_rate = torchaudio.load(transcription_file, format=file_extension)
+             duration = waveform.size(1) / sample_rate
              if duration > MAX_FILE_DURATION:
-                 raise ValueError(f"Audio duration exceeds {MAX_FILE_DURATION} seconds")
-         except Exception as duration_err:
-             logging.exception(f"Error getting/checking audio duration for {transcription_file}")
-             return jsonify({'error': f'Error getting/checking audio duration: {str(duration_err)}'}), 400
-
-
-
-         # Transcribe the audio file
+                 raise ValueError(f"Audio duration exceeds {MAX_FILE_DURATION} seconds")
+         except Exception as load_err:
+             logging.exception(f"Error loading audio file with torchaudio.load: {transcription_file}")
+             try:
+                 # Try the soundfile backend instead (disable sox_io)
+                 torchaudio.set_audio_backend("soundfile")  # Force the soundfile backend
+                 waveform, sample_rate = torchaudio.load(transcription_file)  # Do not pass the file extension
+                 duration = waveform.size(1) / sample_rate
+                 if duration > MAX_FILE_DURATION:
+                     raise ValueError(f"Audio duration exceeds {MAX_FILE_DURATION} seconds")
+
+             except Exception as soundfile_err:
+                 logging.exception(f"Error loading audio file with the soundfile backend: {transcription_file}")
+                 return jsonify({'error': f'Failed to load the audio file with both backends: {str(soundfile_err)}'}), 400
+
+             finally:
+                 torchaudio.set_audio_backend("default")  # Restore the default audio backend
+
+         # Transcribe the audio file
          segments, _ = wmodel.transcribe(
              transcription_file,
              beam_size=beamsize,
@@ -182,21 +202,22 @@ def transcribe():
          }), 200

      except Exception as e:
-         logging.exception("Exception during transcription process")
+         logging.exception("Exception during the transcription process")
          return jsonify({'error': str(e)}), 500

      finally:
          cleanup_temp_files(temp_file_path, temp_audio_path)
          active_requests -= 1
          request_semaphore.release()
-         print(f"Processed in {time.time() - start_time:.2f}s (Active: {active_requests})")
+         print(f"Processed in {time.time() - start_time:.2f}s (active requests: {active_requests})")


  if __name__ == "__main__":
-     # Create temporary folder if it doesn't exist
+     # Create the temporary folder if it does not exist
      if not os.path.exists(TEMPORARY_FOLDER):
          os.makedirs(TEMPORARY_FOLDER)
-         logging.info(f"Created temporary folder: {TEMPORARY_FOLDER}")
+         logging.info(f"Created temporary folder: {TEMPORARY_FOLDER}")

      app.run(host="0.0.0.0", port=7860, threaded=True)
+
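
A note on the torchaudio fallback added in transcribe(): torchaudio.set_audio_backend has historically accepted only "sox_io", "soundfile", or None, so restoring with the string "default" is likely to raise a ValueError on most torchaudio versions (and the get/set backend API is deprecated in recent releases). A minimal sketch of the same fallback that records and restores whatever backend was active, assuming a torchaudio version that still exposes get_audio_backend/set_audio_backend; the helper name is illustrative, not part of this commit:

import torchaudio

def load_audio_with_fallback(path, file_extension):
    """Try the current torchaudio backend first, then fall back to soundfile."""
    try:
        # Same call as in app.py: pass the extension as an explicit format hint.
        return torchaudio.load(path, format=file_extension)
    except Exception:
        previous_backend = torchaudio.get_audio_backend()  # may be None
        try:
            torchaudio.set_audio_backend("soundfile")
            return torchaudio.load(path)  # let the soundfile backend sniff the format
        finally:
            # Restore whatever backend was active before the fallback.
            torchaudio.set_audio_backend(previous_backend)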
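The new extract_audio_from_video runs FFmpeg for the extraction but still opens the clip with moviepy only to read its duration. Not part of this commit, but since ffmpeg-python is now a dependency, the duration could also be read from ffprobe metadata; a minimal sketch using ffmpeg.probe (the helper name is illustrative, and it assumes the container reports a format-level duration):

import ffmpeg

MAX_FILE_DURATION = 60 * 30  # seconds, as configured in app.py

def media_duration_seconds(path):
    """Return the container duration reported by ffprobe, in seconds."""
    info = ffmpeg.probe(path)  # runs ffprobe and parses its JSON output
    return float(info["format"]["duration"])

# Example: enforce the same limit as app.py before extracting audio.
# if media_duration_seconds(video_path) > MAX_FILE_DURATION:
#     raise ValueError(f"Video duration exceeds {MAX_FILE_DURATION} seconds")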
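For quick manual testing of the updated endpoint, a sketch of a client call is below. The multipart field name "file" and port 7860 come from app.py; the /transcribe path and the X-API-Key header name are assumptions, since the route decorator and validate_api_key() are not part of this diff.

import requests

API_URL = "http://localhost:7860/transcribe"  # assumed route path

with open("sample.mp4", "rb") as f:
    response = requests.post(
        API_URL,
        headers={"X-API-Key": "your-api-key"},  # assumed header; check validate_api_key()
        files={"file": ("sample.mp4", f)},      # matches request.files['file'] in app.py
        timeout=600,                            # transcription can take a while
    )

print(response.status_code)
print(response.json())  # success payload on 200, or {'error': ...} on failure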