Angel committed on
Commit
fe1196c
·
2 Parent(s): 35be24f 927f731

Merge pull request #5 from Angel-dash/yt_transcript

Browse files

changes made to yt transcript fetching and then adding

Files changed (2) hide show
  1. Data/yt_transcript.py +21 -21
  2. requirements.txt +2 -0
Data/yt_transcript.py CHANGED
@@ -55,29 +55,29 @@ def fetch_yt_transcript(video_ids):
55
 
56
 
57
  def all_video_transcript_pipeline():
58
- video_links_list, new_video_added,new_videos_link = video_links_main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  if new_video_added:
60
  print("New videos has been added... Fetching transcript for new videos only")
61
- new_videos_id = get_video_id(new_videos_link)
62
- video_transcripts = fetch_yt_transcript(new_videos_link)
63
- else:
64
- print("No new video detected, Loading existing transcript from folders ")
65
- video_transcripts = {}
66
-
67
- transcripts_folder = "transcripts"
68
- if os.path.exists(transcripts_folder):
69
- existing_files = os.listdir(transcripts_folder)
70
- for file in existing_files:
71
- video_id = file.split("_")[0]
72
- with open(os.path.join(transcripts_folder, file), "r", encoding="utf-8") as f:
73
- transcript_text = f.read().splitlines()
74
- video_transcripts[video_id] = {
75
- 'text': transcript_text,
76
- 'file_path': os.path.join(transcripts_folder, file)
77
- }
78
- print(f"Loaded {len(video_transcripts)} transcripts from {transcripts_folder}")
79
- else:
80
- print("No transcript found")
81
  return video_transcripts
82
 
83
 
 
55
 
56
 
57
  def all_video_transcript_pipeline():
58
+ video_links_list, new_video_added, new_videos_link = video_links_main()
59
+ video_transcripts = {}
60
+
61
+ # First load existing transcripts
62
+ transcripts_folder = "transcripts"
63
+ if os.path.exists(transcripts_folder):
64
+ existing_files = os.listdir(transcripts_folder)
65
+ for file in existing_files:
66
+ video_id = file.split("_")[0]
67
+ with open(os.path.join(transcripts_folder, file), "r", encoding="utf-8") as f:
68
+ transcript_text = f.read().splitlines()
69
+ video_transcripts[video_id] = {
70
+ 'text': transcript_text,
71
+ 'file_path': os.path.join(transcripts_folder, file)
72
+ }
73
+
74
+ # Then fetch new transcripts if there are any
75
  if new_video_added:
76
  print("New videos has been added... Fetching transcript for new videos only")
77
+ new_transcripts = fetch_yt_transcript(new_videos_link)
78
+ # Merge new transcripts with existing ones
79
+ video_transcripts.update(new_transcripts)
80
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  return video_transcripts
82
 
83
 
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ requests~=2.32.3
2
+ python-dotenv~=1.0.1