Nightwing11 committed on
Commit
6671679
·
1 Parent(s): 7f23ebc

Solved loading transcripts

Browse files
Files changed (1) hide show
  1. Data/yt_transcript.py +31 -12
Data/yt_transcript.py CHANGED
@@ -56,29 +56,48 @@ def fetch_yt_transcript(video_ids):
56
 
57
 
58
  def all_video_transcript_pipeline():
 
 
 
 
 
59
  video_links_list, new_video_added, new_videos_link = video_links_main()
60
  video_transcripts = {}
61
 
62
- # First load existing transcripts
63
- transcripts_folder = "transcripts"
64
  if os.path.exists(transcripts_folder):
65
  existing_files = os.listdir(transcripts_folder)
 
 
66
  for file in existing_files:
67
- video_id = file.split("_")[0]
68
- with open(os.path.join(transcripts_folder, file), "r", encoding="utf-8") as f:
69
- transcript_text = f.read().splitlines()
70
- video_transcripts[video_id] = {
71
- 'text': transcript_text,
72
- 'file_path': os.path.join(transcripts_folder, file)
73
- }
 
 
 
 
 
 
 
 
 
 
74
 
75
  # Then fetch new transcripts if there are any
76
- if new_video_added:
77
- print("New videos has been added... Fetching transcript for new videos only")
78
- new_transcripts = fetch_yt_transcript(new_videos_link)
 
79
  # Merge new transcripts with existing ones
80
  video_transcripts.update(new_transcripts)
 
81
 
 
82
  return video_transcripts
83
 
84
 
 
56
 
57
 
58
  def all_video_transcript_pipeline():
59
+ # Get the Data directory path
60
+ CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
61
+ transcripts_folder = os.path.join(CURRENT_DIR, "transcripts")
62
+
63
+ print(f"Looking for transcripts in: {transcripts_folder}")
64
  video_links_list, new_video_added, new_videos_link = video_links_main()
65
  video_transcripts = {}
66
 
67
+ # Always load existing transcripts
 
68
  if os.path.exists(transcripts_folder):
69
  existing_files = os.listdir(transcripts_folder)
70
+ print(f"Found {len(existing_files)} files in transcripts folder")
71
+
72
  for file in existing_files:
73
+ if file.endswith('.txt'): # Make sure we only process text files
74
+ video_id = file.split("_")[0]
75
+ file_path = os.path.join(transcripts_folder, file)
76
+ try:
77
+ with open(file_path, "r", encoding="utf-8") as f:
78
+ transcript_text = f.read().splitlines()
79
+ video_transcripts[video_id] = {
80
+ 'text': transcript_text,
81
+ 'file_path': file_path
82
+ }
83
+ print(f"Loaded transcript for video: {video_id}")
84
+ except Exception as e:
85
+ print(f"Error loading transcript {file}: {e}")
86
+ else:
87
+ print(f"Transcripts folder not found at: {transcripts_folder}")
88
+ os.makedirs(transcripts_folder)
89
+ print(f"Created transcripts folder at: {transcripts_folder}")
90
 
91
  # Then fetch new transcripts if there are any
92
+ if new_video_added and new_videos_link:
93
+ print("New videos have been added... Fetching transcripts for new videos")
94
+ new_video_ids = [url.split("v=")[1] for url in new_videos_link] # Extract video IDs
95
+ new_transcripts = fetch_yt_transcript(new_video_ids)
96
  # Merge new transcripts with existing ones
97
  video_transcripts.update(new_transcripts)
98
+ print(f"Added {len(new_transcripts)} new transcripts")
99
 
100
+ print(f"Total transcripts loaded: {len(video_transcripts)}")
101
  return video_transcripts
102
 
103