faster-whisper-transcription-api

Running

App Files Files Community

baoyin2024 committed on Jun 1

Commit

0666a2d

verified ·

1 Parent(s): d1f434b

Update app.py

Browse files

Files changed (1) hide show

app.py +170 -124

app.py CHANGED Viewed

@@ -1,156 +1,202 @@
-from flask import Flask, request, jsonify
-import os
 import io
-import whisperx
-import torchaudio
-import gc
-import tempfile
-import ffmpeg
-from datetime import datetime
 from threading import Semaphore
-app = Flask(__name__)
-# 从环境变量中读取 API_KEY
-api_key = os.environ.get("API_KEY")
-if not api_key:
-    print("Error: API_KEY environment variable not set!")
-# 信号量，用于限制并发请求的数量
 MAX_CONCURRENT_REQUESTS = 2
 request_semaphore = Semaphore(MAX_CONCURRENT_REQUESTS)
-# GPU device
-device = "cuda"
-compute_type = "float16"
 def validate_api_key(request):
-    """
-    验证 API Key.  从 request header 读取 API Key，并与环境变量中的 API Key 进行比较。
-    Args:
-        request: Flask request 对象.
-    Returns:
-        True 如果 API Key 有效，否则 False.
-    """
-    api_key_header = request.headers.get("X-API-Key")
-    api_key_query = request.args.get("api_key")
-    api_key_form = request.form.get("api_key")
-    api_key_env = os.environ.get("API_KEY")
-    if not api_key_env:
-        return False, "API_KEY environment variable not set"
-    if api_key_header == api_key_env or api_key_query == api_key_env or api_key_form == api_key_env:
-        return True, None
-    else:
-        return False, "Invalid API Key"
-@app.route("/whisper_transcribe", methods=["POST"])
-def whisper_transcribe():
-    is_valid, message = validate_api_key(request)  # 验证 API Key
-    if not is_valid:
-        return jsonify({"error": message}), 401
-    with request_semaphore:
-        if 'file' not in request.files:
-            return jsonify({'error': 'No file uploaded'}), 400
-        file = request.files['file']
-        if file.filename == '':
-            return jsonify({'error': 'No file selected'}), 400
-        filename = file.filename
-        file_extension = filename.rsplit('.', 1)[1].lower()
-        allowed_extensions = {'mp3', 'wav', 'ogg', 'm4a', 'flac', 'aac', 'wma', 'opus', 'aiff', 'mp4', 'avi', 'mov',
-                              'mkv', 'webm', 'flv', 'wmv', 'mpeg', 'mpg', '3gp'}
-        if file_extension not in allowed_extensions:
-            return jsonify({'error': f'Invalid file format. Supported: {", ".join(allowed_extensions)}'}), 400
-        try:
-            # Save the uploaded file to a temporary file
-            with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{file_extension}') as temp_file:
-                file.save(temp_file.name)
-                temp_file_path = temp_file.name
-            # Determine if the file is a video file
-            video_extensions = {'mp4', 'avi', 'mov', 'mkv', 'webm', 'flv', 'wmv', 'mpeg', 'mpg', '3gp'}
-            if file_extension in video_extensions:
-                file_type = "video"
-                try:
-                    # Extract audio from video using ffmpeg
-                    audio_file_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
-                    ffmpeg.input(temp_file_path).output(audio_file_path, format='wav', acodec='pcm_s16le').run(quiet=True, overwrite_output=True)
-                except Exception as e:
-                    return jsonify({'error': f'Failed to extract audio from video: {str(e)}'}), 500
-                # Delete the temporary video file
-                os.remove(temp_file_path)
-                audio_file_path_final = audio_file_path
-            else:
-                file_type = "audio"
-                audio_file_path_final = temp_file_path
-            # Load the audio file
-            try:
-                audio, samplerate = torchaudio.load(audio_file_path_final)
-                audio = audio.to(device)
-                if audio.shape[0] > 1:
-                    audio = audio.mean(dim=0, keepdim=True)
-                audio = audio.squeeze()
-                if samplerate != 16000:
-                    audio = torchaudio.functional.resample(audio, samplerate, 16000)
-            except Exception as e:
-                return jsonify({'error': f'Failed to load audio file: {str(e)}'}), 500
-            # Ensure the audio duration does not exceed 10 minutes
-            max_duration = 10 * 60  # 10 minutes in seconds
-            if audio.shape[-1] / 16000 > max_duration:
-                return jsonify({'error': 'Audio duration exceeds the maximum allowed duration of 10 minutes'}), 400
-            # Perform transcription
-            try:
-                wmodel, model_options = get_model()
-                segments, info = wmodel.transcribe(audio, batch_size=model_options.get("batch_size", None))
-                segments = list(segments)  # Convert generator to list
-                transcription = ""
-                for segment in segments:
-                    transcription += segment.text
-            except Exception as e:
-                return jsonify({'error': f'Transcription failed: {str(e)}'}), 500
-            finally:
-                # Clean up temporary files
-                os.remove(audio_file_path_final)
-                gc.collect()
-                torch.cuda.empty_cache()
-            return jsonify({'transcription': transcription, 'file_type': file_type}), 200
-        except Exception as e:
-            return jsonify({'error': str(e)}), 500
-@app.route("/health", methods=["GET"])
-def health_check():
-    return jsonify({"status": "healthy"}), 200
-@app.route("/status/busy", methods=["GET"])
-def status_busy():
-    return jsonify({"busy": request_semaphore._value == 0}), 200
-def get_model():
-    """Load model"""
-    model_name = "guillaumekln/faster-whisper-large-v2"
-    model_options = {"beam_size": 5}
-    wmodel = whisperx.load_model(model_name, device, compute_type=compute_type)
-    return wmodel, model_options
-if __name__ == "__main__":
-    app.run(debug=True, port=int(os.environ.get("PORT", 7860)))

+from flask import Flask, request, jsonify, Response
+from faster_whisper import WhisperModel
+import torch
 import io
+import time
+import datetime
 from threading import Semaphore
+import os
+from werkzeug.utils import secure_filename
+import tempfile
+from moviepy.editor import VideoFileClip
+import logging
+import torchaudio  # Import torchaudio
+# Configure logging: INFO level, timestamped "time - level - message" lines for the whole process
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+app = Flask(__name__)
+# Configuration (module-level constants read by the route handlers below)
 MAX_CONCURRENT_REQUESTS = 2  # size of request_semaphore; further transcription requests get 503
+MAX_FILE_DURATION = 60 * 30  # seconds (30 minutes); compared against media duration
+TEMPORARY_FOLDER = tempfile.gettempdir()  # system temp dir; uploads and extracted audio are written here
+ALLOWED_AUDIO_EXTENSIONS = {'mp3', 'wav', 'ogg', 'm4a', 'flac', 'aac', 'wma', 'opus', 'aiff'}
+ALLOWED_VIDEO_EXTENSIONS = {'mp4', 'avi', 'mov', 'mkv', 'webm', 'flv', 'wmv', 'mpeg', 'mpg', '3gp'}  # these go through audio extraction first
+ALLOWED_EXTENSIONS = ALLOWED_AUDIO_EXTENSIONS.union(ALLOWED_VIDEO_EXTENSIONS)
+API_KEY = os.environ.get("API_KEY")  # Load API key from environment; None when the variable is unset
+MODEL_NAME = os.environ.get("WHISPER_MODEL", "guillaumekln/faster-whisper-large-v2")  # Configurable model via WHISPER_MODEL
+# Device check for faster-whisper: use CUDA with float16 when available, otherwise CPU with int8
+device = "cuda" if torch.cuda.is_available() else "cpu"
+compute_type = "float16" if device == "cuda" else "int8"
+logging.info(f"Using device: {device} with compute_type: {compute_type}")
+# Faster Whisper setup: the model is loaded once at import time and shared by all requests
+beamsize = 2  # passed as beam_size to wmodel.transcribe
+try:
+    wmodel = WhisperModel(
+        MODEL_NAME,
+        device=device,
+        compute_type=compute_type,
+        download_root="./model_cache"
+    )
+    logging.info(f"Model {MODEL_NAME} loaded successfully.")
+except Exception as e:
+    logging.error(f"Failed to load model {MODEL_NAME}: {e}")
+    wmodel = None  # the app still starts; the transcribe route answers 500 while this is None
+# Concurrency control: the semaphore gates transcriptions, the counter is reported by the status routes
 request_semaphore = Semaphore(MAX_CONCURRENT_REQUESTS)
+active_requests = 0  # incremented/decremented by the transcribe route
 def validate_api_key(request):
+    """Return True when the request's X-API-Key header matches API_KEY.
+
+    Args:
+        request: Flask request object whose headers are inspected.
+
+    Returns:
+        bool: True only if API_KEY is configured, the header is present,
+        and both values are equal; False otherwise.
+    """
+    import hmac  # local import: only this function needs it
+    provided_key = request.headers.get('X-API-Key')
+    # Fail closed: with API_KEY unset (None) and no header sent, a plain
+    # equality check compares None == None and would accept every request.
+    if not API_KEY or not provided_key:
+        return False
+    # Constant-time comparison so response timing does not leak key contents.
+    return hmac.compare_digest(provided_key.encode('utf-8'), API_KEY.encode('utf-8'))
+def allowed_file(filename):
+    """Tell whether ``filename`` ends in an extension listed in ALLOWED_EXTENSIONS.
+
+    The extension is the text after the last dot, compared case-insensitively.
+    Names without a dot are rejected.
+    """
+    if '.' not in filename:
+        return False
+    extension = filename.rsplit('.', 1)[1]
+    return extension.lower() in ALLOWED_EXTENSIONS
+def cleanup_temp_files(*file_paths):
+    """Delete each given temporary file, best effort.
+
+    Falsy entries (e.g. None) and paths that do not exist are skipped.
+    A failure on one path is logged and does not stop the remaining ones.
+    """
+    for candidate in file_paths:
+        try:
+            if not candidate or not os.path.exists(candidate):
+                continue
+            os.remove(candidate)
+            logging.info(f"Deleted temporary file: {candidate}")
+        except Exception as cleanup_error:
+            logging.error(f"Error cleaning up temp file {candidate}: {str(cleanup_error)}")
+def extract_audio_from_video(video_path, output_audio_path):
+    """Write the audio track of a video file to ``output_audio_path``.
+
+    Args:
+        video_path: Path of the video file to read.
+        output_audio_path: Destination path for the PCM 16-bit audio file.
+
+    Returns:
+        ``output_audio_path`` once the audio has been written.
+
+    Raises:
+        Exception: For any failure, including a video longer than
+            MAX_FILE_DURATION seconds or one without an audio track. The
+            original error is attached as ``__cause__``.
+    """
+    video = None
+    try:
+        video = VideoFileClip(video_path)
+        if video.duration > MAX_FILE_DURATION:
+            raise ValueError(f"Video duration exceeds {MAX_FILE_DURATION} seconds")
+        # A clip without an audio stream has audio=None; report that clearly
+        # instead of failing with an AttributeError on NoneType.
+        if video.audio is None:
+            raise ValueError("Video file contains no audio track")
+        video.audio.write_audiofile(output_audio_path, codec='pcm_s16le')  # Specify codec
+        return output_audio_path
+    except Exception as e:
+        logging.exception("Error extracting audio from video")
+        raise Exception(f"Failed to extract audio from video: {str(e)}") from e
+    finally:
+        # Release the clip's reader resources on every path, including write failures.
+        if video is not None:
+            video.close()
+@app.route("/health", methods=["GET"])
+def health_check():
+    """Report service status and the current runtime configuration as JSON."""
+    payload = {
+        'status': 'API is running',
+        'timestamp': datetime.datetime.now().isoformat(),
+        'device': device,
+        'compute_type': compute_type,
+        'active_requests': active_requests,
+        'max_duration_supported': MAX_FILE_DURATION,
+        'supported_formats': list(ALLOWED_EXTENSIONS),
+        'model': MODEL_NAME,
+    }
+    return jsonify(payload)
+@app.route("/status/busy", methods=["GET"])
+def server_busy():
+    """Report as JSON whether all transcription slots are currently in use."""
+    payload = {
+        'is_busy': active_requests >= MAX_CONCURRENT_REQUESTS,
+        'active_requests': active_requests,
+        'max_capacity': MAX_CONCURRENT_REQUESTS,
+    }
+    return jsonify(payload)
+@app.route("/whisper_transcribe", methods=["POST"])
+def transcribe():
+    """Transcribe an uploaded audio or video file.
+
+    Expects a multipart POST with the media under the ``file`` field and the
+    API key in the ``X-API-Key`` header.
+
+    Returns:
+        A Flask JSON response with one of these statuses:
+        200 - ``{'transcription': str, 'file_type': 'audio' | 'video'}``.
+        400 - no file, unsupported extension, or the audio duration check failed.
+        401 - invalid API key.
+        500 - model not loaded, or any other failure (including video audio extraction).
+        503 - all MAX_CONCURRENT_REQUESTS slots are in use.
+    """
+    global active_requests
+    import uuid  # local import: used only to build unique temp-file names
+    if not validate_api_key(request):
+        return jsonify({'error': 'Invalid API key'}), 401
+    # Non-blocking acquire: reject immediately instead of queueing the request.
+    if not request_semaphore.acquire(blocking=False):
+        return jsonify({'error': 'Server busy'}), 503
+    active_requests += 1
+    start_time = time.time()
+    temp_file_path = None
+    temp_audio_path = None
+    try:
+        if wmodel is None:
+            return jsonify({'error': 'Model failed to load. Check server logs.'}), 500
+        if 'file' not in request.files:
+            return jsonify({'error': 'No file provided'}), 400
+        file = request.files['file']
+        if not (file and allowed_file(file.filename)):
+            return jsonify({'error': f'Invalid file format. Supported: {", ".join(ALLOWED_EXTENSIONS)}'}), 400
+        # allowed_file() guarantees a dot and an allow-listed extension here.
+        file_extension = file.filename.rsplit('.', 1)[1].lower()
+        is_video = file_extension in ALLOWED_VIDEO_EXTENSIONS
+        # Name temp files with a per-request random id rather than the client's
+        # filename or a timestamp: concurrent uploads of the same name (or within
+        # the same second) would otherwise overwrite and delete each other's files.
+        request_id = uuid.uuid4().hex
+        temp_file_path = os.path.join(TEMPORARY_FOLDER, f"upload_{request_id}.{file_extension}")
+        file.save(temp_file_path)
+        if is_video:
+            # Video: extract the audio track first (this also enforces the duration limit).
+            temp_audio_path = os.path.join(TEMPORARY_FOLDER, f"temp_audio_{request_id}.wav")
+            extract_audio_from_video(temp_file_path, temp_audio_path)
+            transcription_file = temp_audio_path
+        else:
+            transcription_file = temp_file_path
+            # Check audio file duration directly
+            try:
+                info = torchaudio.info(transcription_file)
+                duration = info.num_frames / info.sample_rate
+                if duration > MAX_FILE_DURATION:
+                    raise ValueError(f"Audio duration exceeds {MAX_FILE_DURATION} seconds")
+            except Exception as duration_err:
+                logging.exception(f"Error getting/checking audio duration for {transcription_file}")
+                return jsonify({'error': f'Error getting/checking audio duration: {str(duration_err)}'}), 400
+        # Transcribe the audio file
+        segments, _ = wmodel.transcribe(
+            transcription_file,
+            beam_size=beamsize,
+            vad_filter=True,
+            without_timestamps=True,
+            compression_ratio_threshold=2.4,
+            word_timestamps=False
+        )
+        # segments is consumed here, while the temp files still exist.
+        full_text = " ".join(segment.text for segment in segments)
+        return jsonify({
+            'transcription': full_text,
+            'file_type': 'video' if is_video else 'audio'
+        }), 200
+    except Exception as e:
+        logging.exception("Exception during transcription process")
+        return jsonify({'error': str(e)}), 500
+    finally:
+        # Runs on every exit path after the semaphore was acquired, so the
+        # slot and counter are always restored and temp files removed.
+        cleanup_temp_files(temp_file_path, temp_audio_path)
+        active_requests -= 1
+        request_semaphore.release()
+        logging.info(f"Processed in {time.time() - start_time:.2f}s (Active: {active_requests})")
+if __name__ == "__main__":
+    # Make sure the temp directory is present before the server accepts uploads.
+    temp_dir_missing = not os.path.exists(TEMPORARY_FOLDER)
+    if temp_dir_missing:
+        os.makedirs(TEMPORARY_FOLDER)
+        logging.info(f"Created temporary folder: {TEMPORARY_FOLDER}")
+    # Listen on all interfaces, port 7860, one thread per request.
+    app.run(port=7860, host="0.0.0.0", threaded=True)