magdap116 committed on
Commit
6530149
·
verified ·
1 Parent(s): e88b9d6

Upload 2 files

Browse files
Files changed (2) hide show
  1. wikipedia_utils.py +60 -0
  2. youtube_utils.py +24 -0
wikipedia_utils.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import wikipedia
2
+ import spacy
3
+
4
_NLP_MODEL = None


def _load_nlp():
    """Return the shared spaCy English pipeline, loading it on first use."""
    global _NLP_MODEL
    if _NLP_MODEL is None:
        # Loading the model is expensive, so do it once per process instead
        # of on every query.
        _NLP_MODEL = spacy.load("en_core_web_sm")
    return _NLP_MODEL


def get_wiki_query(query):
    """Extract a Wikipedia search subject from a natural-language question.

    Named entities reported by spaCy are preferred and are joined with single
    spaces. When the query contains no entities, the text of the first token
    tagged NOUN or PROPN is used instead.

    Args:
        query: The question text to analyse.

    Returns:
        The subject string, or ``None`` when no subject can be derived or
        parsing fails.
    """
    try:
        doc = _load_nlp()(query)

        # Entity path (people, events, books).
        entity_texts = [ent.text for ent in doc.ents]
        if entity_texts:
            # Separate entities with a space so that several entities do not
            # fuse into a single unsearchable token.
            subject_of_the_query = " ".join(entity_texts)
            if subject_of_the_query == "":
                print("Entity query not parsed.")
            return subject_of_the_query

        # Noun path: fall back to the first noun / proper noun, if any.
        first_noun = next(
            (t for t in doc if t.pos_ in {"NOUN", "PROPN"}), None
        )
        if first_noun is None:
            # Nothing usable in the query; report it explicitly rather than
            # tripping over ``None.text``.
            print("No entity or noun found in query", query)
            return None
        print("Returning first noun from the query.")
        return first_noun.text
    except Exception as e:
        # Best-effort helper: report the failure and signal it with None.
        print("Failed parsing a query subject from query", query)
        print(e)
        return None
39
+
40
+
41
def fetch_wikipedia_page(wiki_query):
    """Return the text content of the top Wikipedia search hit for a query.

    Args:
        wiki_query: Search string passed to ``wikipedia.search``.

    Returns:
        The content of the first matching article, ``""`` when the search
        returns no articles, or ``None`` when the query is empty/``None`` or
        the lookup raises.
    """
    # The query-extraction step can yield None; skip the network round trip
    # instead of sending a meaningless search.
    if not wiki_query:
        print("Could not fetch the wikipedia article using ", wiki_query)
        return None

    try:
        matched_articles = wikipedia.search(wiki_query)
        if not matched_articles:
            return ""
        # Request the page by the exact title the search returned.
        page = wikipedia.page(matched_articles[0], auto_suggest=False)
        return page.content
    except Exception as e:
        # Best-effort helper: report the failure and signal it with None.
        print("Could not fetch the wikipedia article using ", wiki_query)
        print(e)
        return None
53
+
54
# Smoke run: derive a search subject for every sample question, then fetch
# the Wikipedia page content for each derived subject.
test_queries = [
    "How many albums did Amy Winehouse publish?",
    "Who is Evora Cesaria?",
    "Is cat an animal?",
]
wiki_queries = [get_wiki_query(question) for question in test_queries]
wiki_pages = [fetch_wikipedia_page(subject) for subject in wiki_queries]
youtube_utils.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from youtube_transcript_api import YouTubeTranscriptApi
2
+ import re
3
+
4
# Matches youtu.be/<id> and youtube.com/{watch?v=,embed/,v/,shorts/}<id>,
# capturing the 11-character video id.
_YOUTUBE_ID_PATTERN = re.compile(
    r'(?:youtu\.be/|youtube\.com/(?:watch\?v=|embed/|v/|shorts/))([\w-]{11})'
)


def get_youtube_video_id(query):
    """Extract the YouTube video id from text containing a YouTube URL.

    Args:
        query: Text that may contain a YouTube link.

    Returns:
        The 11-character video id of the first link found, or ``None`` when
        the text contains no recognised link or is not a string.
    """
    try:
        match = _YOUTUBE_ID_PATTERN.search(query)
    except TypeError:
        # Non-string input (e.g. None) cannot contain a link.
        match = None

    if match is None:
        print("Did not find youtube video id from query ", query)
        return None

    video_id = match.group(1)
    print(video_id)
    return video_id
13
+
14
def fetch_transcript_english(video_id):
    """Fetch the English transcript of a YouTube video.

    Args:
        video_id: The YouTube video id to look up.

    Returns:
        The result of ``YouTubeTranscriptApi().fetch`` called with
        ``languages=['en']``, or ``None`` when the fetch raises.
    """
    try:
        return YouTubeTranscriptApi().fetch(video_id, languages=['en'])
    except Exception as e:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit still propagate, and report what actually went wrong.
        print("Could not fetch the english transcript for video ", video_id)
        print(e)
        return None
21
+
22
def post_process_transcript(transcript_snippets):
    """Join transcript snippet texts into one space-separated string.

    Args:
        transcript_snippets: Iterable of objects exposing a ``text``
            attribute, or ``None``.

    Returns:
        The snippet texts joined by single spaces; ``""`` when given ``None``
        or an empty iterable.
    """
    # The transcript fetch step reports failure with None; treat that as
    # "no transcript" instead of raising TypeError.
    if transcript_snippets is None:
        return ""
    return " ".join(snippet.text for snippet in transcript_snippets)