Spaces:
Runtime error
Runtime error
File size: 6,175 Bytes
2a8befa a030e94 ca2c65d 2a8befa a030e94 97d69f2 a030e94 97d69f2 a030e94 97d69f2 a030e94 97d69f2 a030e94 97d69f2 a030e94 97d69f2 a030e94 97d69f2 a030e94 97d69f2 a030e94 97d69f2 a030e94 ca2c65d a030e94 97d69f2 a030e94 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
import os
import time
import requests
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
# Outbound proxy used by the `requests`-based fallbacks only.
# NOTE(review): the proxy credentials and API keys below are committed in
# source. They are kept as defaults so existing deployments keep working, but
# they should be rotated and supplied through the environment instead.
_DEFAULT_PROXY_URL = "http://Vv8lHp2g:kMhGaCi9XZ@103.172.84.179:50100"

# YT_PROXY_URL, when set and non-empty, replaces the proxy for both schemes.
_proxy_url = os.environ.get("YT_PROXY_URL") or _DEFAULT_PROXY_URL
proxies = {
    "http": _proxy_url,
    "https": _proxy_url,
}

YTI_API_URL = "https://www.youtube-transcript.io/api/transcripts"

_DEFAULT_YTI_API_KEYS = [
    "681c60baa1baf5a82dd5f382",
    "681c624386b69cddd17685ed",
    "681c628dde80881429decd76",
    "681c62bfa1baf5a82dd5f3b3",
    "681c62eade80881429decd7f",
    "68244ac37994b78ec23e3089",
    "68244c15f0a725b52f52477e",
    "68244c6d7994b78ec23e30b4",
]

# YTI_API_KEYS may be given as a comma-separated environment variable; blank
# entries are dropped, and the built-in list is used when nothing usable is set.
YTI_API_KEYS = [
    key.strip()
    for key in os.environ.get("YTI_API_KEYS", "").split(",")
    if key.strip()
] or _DEFAULT_YTI_API_KEYS
def fetch_transcript(video_id: str, max_retries: int = 2, backoff_factor: float = 2.0) -> str:
    """
    Fetch the transcript of a YouTube video as a single space-joined string.

    Three sources are tried in order; the next one is used only after the
    previous one has failed on every attempt:
      1) YouTubeTranscriptApi (en, hi)
      2) youtube-transcript.io API (each key in YTI_API_KEYS in turn)
      3) YouTube timedtext web endpoint (English)

    Args:
        video_id: The YouTube video id to fetch the transcript for.
        max_retries: Number of attempts per source. Values below 1 are
            treated as 1 so every source is tried at least once.
        backoff_factor: Base of the exponential wait between attempts; the
            wait after the k-th failed attempt (0-based) is
            ``backoff_factor ** k`` seconds.

    Returns:
        The transcript text, with segments joined by single spaces.

    Raises:
        Exception: Whatever the final attempt of the third source raised,
            when all three sources fail.
    """
    # Guard against max_retries <= 0, which would otherwise skip every attempt
    # and make the function return None without trying anything.
    attempts = max(1, max_retries)
    # Seconds to wait on each HTTP call made through `requests`.
    request_timeout = 10

    def _retry(fn, *args, **kwargs):
        """Call fn, retrying with exponential backoff; re-raise the last error."""
        for attempt in range(attempts):
            try:
                return fn(*args, **kwargs)
            except Exception:
                if attempt == attempts - 1:
                    # Last attempt: let the caller decide on the next source.
                    raise
                wait = backoff_factor ** attempt
                print(f"[Retry] {fn.__name__} failed (attempt {attempt+1}), retrying in {wait}s...")
                time.sleep(wait)

    def _segment_text(segment):
        """Return the text of one transcript segment (dict or snippet object)."""
        # get_transcript() yields dicts; fetch() in youtube_transcript_api 1.x
        # yields snippet objects that expose the text as an attribute instead.
        if isinstance(segment, dict):
            return segment["text"]
        return segment.text

    def _mask(api_key):
        """Return a log-safe form of an API key (last four characters only)."""
        return f"...{api_key[-4:]}"

    # --- Method 1: youtube_transcript_api ---
    def _yt_api():
        print(f"[Transcript] Trying youtube_transcript_api for video_id={video_id}")
        try:
            # Try direct get_transcript
            segs = YouTubeTranscriptApi.get_transcript(video_id, languages=['en', 'hi'])
        except (TranscriptsDisabled, NoTranscriptFound):
            print(f"[Transcript] youtube_transcript_api direct failed, trying list_transcripts for video_id={video_id}")
            # Fall back to listing the transcripts and selecting one explicitly.
            transcripts = YouTubeTranscriptApi.list_transcripts(video_id)
            try:
                t = transcripts.find_transcript(['en'])
            except NoTranscriptFound:
                print(f"[Transcript] No English transcript, trying generated Hindi transcript for video_id={video_id}")
                t = transcripts.find_generated_transcript(['hi'])
            segs = t.fetch()
        print(f"[Transcript] youtube_transcript_api succeeded for video_id={video_id}")
        return " ".join(_segment_text(s) for s in segs)

    try:
        return _retry(_yt_api)
    except Exception as e1:
        print(f"[Fallback 1 failed] {e1}")
        print(f"[Transcript] Falling back to youtube-transcript.io API for video_id={video_id}")

    # --- Method 2: youtube-transcript.io API ---
    def _yti_api():
        print(f"[Transcript] Trying youtube-transcript.io API for video_id={video_id}")
        last_err = None
        for idx, api_key in enumerate(YTI_API_KEYS):
            # Keys are secrets: only a masked form is ever written to the log.
            masked = _mask(api_key)
            print(f"[Transcript] Using API key {idx+1}/{len(YTI_API_KEYS)}: {masked}")
            try:
                resp = requests.post(
                    YTI_API_URL,
                    headers={
                        "Authorization": f"Basic {api_key}",
                        "Content-Type": "application/json",
                    },
                    json={"ids": [video_id]},
                    proxies=proxies,
                    # Without a timeout a stalled proxy would block forever.
                    timeout=request_timeout,
                )
                resp.raise_for_status()
                data = resp.json()
                entry = data.get(video_id, {})

                # Try 'segments' format first
                segments = entry.get("segments", [])
                if segments:
                    print(f"[Transcript] youtube-transcript.io API succeeded with 'segments' for video_id={video_id}")
                    return " ".join(seg["text"] for seg in segments)

                # Try 'tracks' format: only the first track is used.
                tracks = entry.get("tracks", [])
                if tracks and tracks[0].get("transcript"):
                    print(f"[Transcript] youtube-transcript.io API succeeded with 'tracks' for video_id={video_id}")
                    return " ".join(item["text"] for item in tracks[0]["transcript"])

                print(f"[Transcript] No transcript found in 'segments' or 'tracks' for video_id={video_id} with key {masked}")
                last_err = ValueError("No transcript found in 'segments' or 'tracks'")
            except Exception as e:
                # Any failure (HTTP, JSON, unexpected shape) moves on to the next key.
                print(f"[Transcript] API key {masked} failed: {e}")
                last_err = e
        # last_err is None only when YTI_API_KEYS is empty.
        raise last_err or Exception("All youtube-transcript.io API keys failed")

    try:
        return _retry(_yti_api)
    except Exception as e2:
        print(f"[Fallback 2 failed] {e2}")
        print(f"[Transcript] Falling back to YouTube timedtext endpoint for video_id={video_id}")

    # --- Method 3: YouTube timedtext endpoint ---
    def _yt_timedtext():
        print(f"[Transcript] Trying YouTube timedtext endpoint for video_id={video_id}")
        # construct the "web" captions endpoint
        url = (
            f"https://video.google.com/timedtext?"
            f"lang=en&v={video_id}"
        )
        resp = requests.get(url, timeout=request_timeout, proxies=proxies)
        resp.raise_for_status()
        # An empty body cannot be parsed as XML; report it with the same
        # "no captions" error instead of an opaque ParseError.
        if not resp.text.strip():
            raise ValueError("No captions in timedtext response")
        # XML format: <transcript><text start="..." dur="...">...text...</text>...</transcript>
        import xml.etree.ElementTree as ET
        root = ET.fromstring(resp.text)
        if not list(root):
            raise ValueError("No captions in timedtext response")
        print(f"[Transcript] YouTube timedtext endpoint succeeded for video_id={video_id}")
        # combine all text nodes, flattening embedded newlines
        return " ".join(
            node.text.strip().replace('\n', ' ')
            for node in root.findall('text')
            if node.text
        )

    # final attempt; if this fails it'll raise
    return _retry(_yt_timedtext)
|