Spaces:

Nightwing11
/

Hubermanbot2

Sleeping

App Files Files Community

Nightwing11 committed on Jan 8

Commit

6671679

1 Parent(s): 7f23ebc

Solved loading transcripts

Browse files

Files changed (1) hide show

Data/yt_transcript.py +31 -12

Data/yt_transcript.py CHANGED Viewed

@@ -56,29 +56,48 @@ def fetch_yt_transcript(video_ids):
 def all_video_transcript_pipeline():
     video_links_list, new_video_added, new_videos_link = video_links_main()
     video_transcripts = {}
-    # First load existing transcripts
-    transcripts_folder = "transcripts"
     if os.path.exists(transcripts_folder):
         existing_files = os.listdir(transcripts_folder)
         for file in existing_files:
-            video_id = file.split("_")[0]
-            with open(os.path.join(transcripts_folder, file), "r", encoding="utf-8") as f:
-                transcript_text = f.read().splitlines()
-            video_transcripts[video_id] = {
-                'text': transcript_text,
-                'file_path': os.path.join(transcripts_folder, file)
-            }
     # Then fetch new transcripts if there are any
-    if new_video_added:
-        print("New videos has been added... Fetching transcript for new videos only")
-        new_transcripts = fetch_yt_transcript(new_videos_link)
         # Merge new transcripts with existing ones
         video_transcripts.update(new_transcripts)
     return video_transcripts

 def all_video_transcript_pipeline():
+    # Get the Data directory path
+    CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+    transcripts_folder = os.path.join(CURRENT_DIR, "transcripts")
+    print(f"Looking for transcripts in: {transcripts_folder}")
     video_links_list, new_video_added, new_videos_link = video_links_main()
     video_transcripts = {}
+    # Always load existing transcripts
     if os.path.exists(transcripts_folder):
         existing_files = os.listdir(transcripts_folder)
+        print(f"Found {len(existing_files)} files in transcripts folder")
         for file in existing_files:
+            if file.endswith('.txt'):  # Make sure we only process text files
+                video_id = file.split("_")[0]
+                file_path = os.path.join(transcripts_folder, file)
+                try:
+                    with open(file_path, "r", encoding="utf-8") as f:
+                        transcript_text = f.read().splitlines()
+                    video_transcripts[video_id] = {
+                        'text': transcript_text,
+                        'file_path': file_path
+                    }
+                    print(f"Loaded transcript for video: {video_id}")
+                except Exception as e:
+                    print(f"Error loading transcript {file}: {e}")
+    else:
+        print(f"Transcripts folder not found at: {transcripts_folder}")
+        os.makedirs(transcripts_folder)
+        print(f"Created transcripts folder at: {transcripts_folder}")
     # Then fetch new transcripts if there are any
+    if new_video_added and new_videos_link:
+        print("New videos have been added... Fetching transcripts for new videos")
+        new_video_ids = [url.split("v=")[1] for url in new_videos_link]  # Extract video IDs
+        new_transcripts = fetch_yt_transcript(new_video_ids)
         # Merge new transcripts with existing ones
         video_transcripts.update(new_transcripts)
+        print(f"Added {len(new_transcripts)} new transcripts")
+    print(f"Total transcripts loaded: {len(video_transcripts)}")
     return video_transcripts