Spaces:
Sleeping
Sleeping
Merge pull request #5 from Angel-dash/yt_transcript
Browse files
changes made to yt transcript fetching and then adding
- Data/yt_transcript.py +21 -21
- requirements.txt +2 -0
Data/yt_transcript.py
CHANGED
|
@@ -55,29 +55,29 @@ def fetch_yt_transcript(video_ids):
|
|
| 55 |
|
| 56 |
|
| 57 |
def all_video_transcript_pipeline():
|
| 58 |
-
video_links_list, new_video_added,new_videos_link = video_links_main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
if new_video_added:
|
| 60 |
print("New videos has been added... Fetching transcript for new videos only")
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
video_transcripts = {}
|
| 66 |
-
|
| 67 |
-
transcripts_folder = "transcripts"
|
| 68 |
-
if os.path.exists(transcripts_folder):
|
| 69 |
-
existing_files = os.listdir(transcripts_folder)
|
| 70 |
-
for file in existing_files:
|
| 71 |
-
video_id = file.split("_")[0]
|
| 72 |
-
with open(os.path.join(transcripts_folder, file), "r", encoding="utf-8") as f:
|
| 73 |
-
transcript_text = f.read().splitlines()
|
| 74 |
-
video_transcripts[video_id] = {
|
| 75 |
-
'text': transcript_text,
|
| 76 |
-
'file_path': os.path.join(transcripts_folder, file)
|
| 77 |
-
}
|
| 78 |
-
print(f"Loaded {len(video_transcripts)} transcripts from {transcripts_folder}")
|
| 79 |
-
else:
|
| 80 |
-
print("No transcript found")
|
| 81 |
return video_transcripts
|
| 82 |
|
| 83 |
|
|
|
|
| 55 |
|
| 56 |
|
| 57 |
def all_video_transcript_pipeline():
|
| 58 |
+
video_links_list, new_video_added, new_videos_link = video_links_main()
|
| 59 |
+
video_transcripts = {}
|
| 60 |
+
|
| 61 |
+
# First load existing transcripts
|
| 62 |
+
transcripts_folder = "transcripts"
|
| 63 |
+
if os.path.exists(transcripts_folder):
|
| 64 |
+
existing_files = os.listdir(transcripts_folder)
|
| 65 |
+
for file in existing_files:
|
| 66 |
+
video_id = file.split("_")[0]
|
| 67 |
+
with open(os.path.join(transcripts_folder, file), "r", encoding="utf-8") as f:
|
| 68 |
+
transcript_text = f.read().splitlines()
|
| 69 |
+
video_transcripts[video_id] = {
|
| 70 |
+
'text': transcript_text,
|
| 71 |
+
'file_path': os.path.join(transcripts_folder, file)
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
# Then fetch new transcripts if there are any
|
| 75 |
if new_video_added:
|
| 76 |
print("New videos has been added... Fetching transcript for new videos only")
|
| 77 |
+
new_transcripts = fetch_yt_transcript(new_videos_link)
|
| 78 |
+
# Merge new transcripts with existing ones
|
| 79 |
+
video_transcripts.update(new_transcripts)
|
| 80 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
return video_transcripts
|
| 82 |
|
| 83 |
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
requests~=2.32.3
|
| 2 |
+
python-dotenv~=1.0.1
|