File size: 2,105 Bytes
a030e94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
import re
import time
import json
import urllib.parse
import concurrent.futures
from urllib.parse import urlparse, parse_qs
from utils.prompts import DESCRIPTION_SYSTEM_PROMPT,DESCRIPTION_USER_PROMPT
from aws_llm import llm_response
from utils.transcript import fetch_transcript

def get_thumbnail_url(video_href: str) -> str:
    """Build the ``hqdefault`` thumbnail URL for a watch-style video link.

    The video ID is read from the first ``v`` query parameter of
    ``video_href``.

    Args:
        video_href: Video link carrying the ID as ``?v=<id>``.

    Returns:
        The ``i.ytimg.com`` thumbnail URL for that ID. Note that ``None`` is
        returned (despite the ``str`` annotation) when the link has no
        non-empty ``v`` parameter.
    """
    params = parse_qs(urlparse(video_href).query)
    candidates = params.get("v")
    # parse_qs omits blank values, so a present key always has an entry.
    video_id = candidates[0] if candidates else None
    if not video_id:
        return None
    return f"https://i.ytimg.com/vi/{video_id}/hqdefault.jpg"

def parse_video_id(url: str) -> str:
    """Extract the video ID from the ``v`` query parameter of ``url``.

    Args:
        url: Video link carrying the ID as ``?v=<id>``.

    Returns:
        The first value of the ``v`` query parameter.

    Raises:
        ValueError: If ``url`` has no non-empty ``v`` query parameter.
    """
    values = parse_qs(urlparse(url).query).get("v")
    if not values:
        raise ValueError(f"No video ID in URL: {url}")
    return values[0]


def duration_in_range(raw_duration: str, min_min: int = 2, max_min: int = 80) -> bool:
    """Report whether a clock-style duration lies within a range of minutes.

    Args:
        raw_duration: Duration formatted as ``"H:MM:SS"`` or ``"M:SS"``.
        min_min: Inclusive lower bound, in minutes.
        max_min: Inclusive upper bound, in minutes.

    Returns:
        True when the duration, converted to minutes, satisfies
        ``min_min <= duration <= max_min``. False when it falls outside the
        range or cannot be interpreted: a non-string value such as ``None``,
        a non-integer component, or a component count other than two or three.
    """
    # A missing duration (e.g. None) has no .split(); treat it like any other
    # unparseable value instead of letting AttributeError escape to the caller.
    if not isinstance(raw_duration, str):
        return False

    try:
        parts = [int(p) for p in raw_duration.split(":")]
    except ValueError:
        return False

    if len(parts) == 3:
        hrs, mins, secs = parts
        total = hrs * 60 + mins + secs / 60
    elif len(parts) == 2:
        mins, secs = parts
        total = mins + secs / 60
    else:
        return False
    return min_min <= total <= max_min
    
def sanitize_filename(name: str) -> str:
    """Return ``name`` with filename-unsafe characters replaced by ``_``.

    Each of the characters ``\\ / * ? : " < > |`` becomes an underscore, and
    surrounding whitespace is then stripped from the result.
    """
    unsafe_chars = r'[\\/*?:"<>|]'
    cleaned = re.sub(unsafe_chars, "_", name)
    return cleaned.strip()


def process_video_data(vid_data):
    """Attach a transcript and an LLM-written description to a video record.

    The transcript is fetched for ``vid_data["video_id"]``; if that fails, a
    warning is printed and the transcript is left empty. The description
    prompt is built from the stripped transcript, or from
    ``vid_data["title"]`` when the transcript is blank.

    Args:
        vid_data: Mapping with at least ``"video_id"`` and ``"title"`` keys.

    Returns:
        A new dict holding every entry of ``vid_data`` plus ``"transcript"``,
        ``"description"`` and ``"llm_cost"``. If anything other than the
        transcript fetch fails, the error is printed and ``vid_data`` is
        returned unchanged.
    """
    try:
        video_id = vid_data["video_id"]
        title = vid_data["title"]

        # A missing transcript is not fatal: fall back to an empty one.
        try:
            transcript = fetch_transcript(video_id)
        except Exception as err:
            print(f"⚠️ Transcript failed for {title}: {err}")
            transcript = ""

        stripped = transcript.strip()
        material = stripped if stripped else title

        prompt = DESCRIPTION_USER_PROMPT.format(material=material)
        description, cost = llm_response(DESCRIPTION_SYSTEM_PROMPT, prompt)

        enriched = {**vid_data}
        enriched["transcript"] = transcript
        enriched["description"] = description
        enriched["llm_cost"] = cost
        return enriched
    except Exception as err:
        print(f"Error processing video {vid_data.get('title', 'unknown')}: {err}")
        return vid_data