Defender117 committed
Commit b6204d2 · verified · 1 Parent(s): 4b1420b

Upload 5 files

Files changed (3)
  1. GenerateAIPodcast.py +25 -0
  2. app.py +11 -19
  3. crawl_archive.py +136 -0
GenerateAIPodcast.py ADDED
@@ -0,0 +1,25 @@
+ # Text-to-speech helper that turns a summary string into an audio file
+ import pyttsx3
+
+
+ def generateMp3(text, number):
+     # Initialize the pyttsx3 engine ('sapi5' is the Windows speech driver;
+     # drop driverName to let pyttsx3 pick the platform default)
+     engine = pyttsx3.init(driverName='sapi5')
+
+     # Pick a voice; the available voices and their order depend on the system
+     voices = engine.getProperty('voices')
+     engine.setProperty('voice', voices[2].id)
+
+     # Slow the speech rate down from the default
+     newVoiceRate = 145
+     engine.setProperty('rate', newVoiceRate)
+
+     # Both .mp3 and .wav extensions work here
+     fileName = "Ttldr - " + str(number) + ".mp3"
+     engine.save_to_file(text, fileName)
+
+     # Block until the queued file has been written
+     engine.runAndWait()
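A minimal sanity check for this module, assuming a Windows machine (since `sapi5` is the Windows driver; on Linux pyttsx3 falls back to eSpeak and the voice index at `voices[2]` may not exist):

    from GenerateAIPodcast import generateMp3

    # Writes "Ttldr - 1.mp3" into the current working directory
    generateMp3("Hello, this is a test of the podcast voice.", 1)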
app.py CHANGED
@@ -1,6 +1,9 @@
  from fastapi import FastAPI
  from transformers import pipeline
 
+ import crawl_archive
+ import GenerateAIPodcast
+
  from llama_cpp import Llama
 
@@ -12,19 +15,6 @@ llm = Llama.from_pretrained(
      filename="llama-3.2-1b-instruct-q8_0.gguf",
  )
 
- check = llm.create_chat_completion(
-     messages=[
-         {
-             "role": "user",
-             "content": "What is the capital of France?"
-         }
-     ]
- )
-
- print(check['choices'][0]['message']['content'])
-
  ## create a new FastAPI app instance
  app = FastAPI()
 
  @app.get("/")
  def home():
      return {"message": "Hello World"}
@@ -32,18 +22,20 @@ def home():
  # Define a function to handle the GET request at `/generate`
 
 
  @app.get("/generate")
- def generate(text: str):
+ def generate(link: str):
      ## use the model to generate text from the given input
      output = llm.create_chat_completion(
          messages=[
-             {
-                 "role": "user",
-                 "content": f"{text}"
-             }
+             {"role": "system",
+              "content": "Always answer briefly but in detail, and don't use * in your answers. It should sound good as a podcast."},
+             {"role": "user", "content": f"Please summarize this website: {link}."}
          ]
      )
 
      ## return the generated text in a JSON response
-     return {"output": output[0]['generated_text']}
+     return output['choices'][0]['message']['content']
+
+
+ crawl_archive.run_tldr_crawler()
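To exercise the changed endpoint, the app can be served with uvicorn (`uvicorn app:app`) and queried over HTTP. A sketch, assuming the server is on the default port 8000 and the `requests` package is installed; note that importing app.py also triggers `crawl_archive.run_tldr_crawler()` at module load, so the crawler runs once on startup:

    import requests

    # The model only sees the URL string itself; page fetching
    # happens in crawl_archive, not in this endpoint.
    resp = requests.get(
        "http://127.0.0.1:8000/generate",
        params={"link": "https://example.com"},
    )
    print(resp.json())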
crawl_archive.py ADDED
@@ -0,0 +1,136 @@
+ import asyncio
+ import concurrent.futures
+ import time
+ from datetime import datetime
+
+ from llama_cpp import Llama
+ from openai import OpenAI  # only needed by the commented-out LM Studio client below
+ from selenium import webdriver
+ from selenium.common import WebDriverException
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.chrome.service import Service
+ from webdriver_manager.chrome import ChromeDriverManager
+
+ from btts import generateAudioFile
+
+ # client = OpenAI(base_url="http://localhost:8080/v1", api_key="lm-studio")
+
+
+ '''def make_request(link):
+     print("-" * 80)
+     print("Make Request is called")
+     try:
+         completion = client.chat.completions.create(
+             model="model-identifier",
+             messages=[
+                 {"role": "system",
+                  "content": "Always answer briefly but in detail, and don't use * in your answers. It should sound good as a podcast."},
+                 {"role": "user", "content": f"Please summarize this website: {link}."}
+             ],
+             temperature=0.7,
+         )
+         message = completion.choices[0].message.content
+         return message
+     except Exception as e:
+         print(f"Thread encountered an error: {e}")
+ '''
+
+
+ llm = Llama.from_pretrained(
+     repo_id="hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF",
+     filename="llama-3.2-1b-instruct-q8_0.gguf",
+ )
+
+
+ def generate(link: str):
+     ## use the model to generate a summary for the given link
+     output = llm.create_chat_completion(
+         messages=[
+             {"role": "system",
+              "content": "Always answer briefly but in detail, and don't use * in your answers. It should sound good as a podcast."},
+             {"role": "user", "content": f"Please summarize this website: {link}."}
+         ]
+     )
+
+     ## return the generated text
+     return output['choices'][0]['message']['content']
+
+
+ def run_tldr_crawler():
+     # Set up the Selenium WebDriver
+     options = webdriver.ChromeOptions()
+     # options.add_argument('--headless')  # Run in headless mode (no browser UI)
+     options.add_argument('--disable-gpu')
+     options.add_argument('--no-sandbox')
+
+     # Initialize the WebDriver
+     service = Service(ChromeDriverManager().install())
+     driver = webdriver.Chrome(service=service, options=options)
+     try:
+         date = datetime.today().strftime('%Y-%m-%d')
+         print(date)
+         # Comment this out unless you run this on a weekend (no new issue is published then)
+         date = '2025-03-07'
+
+         # Step 1: Navigate to the TLDR archive page for that date
+         url = f"https://tldr.tech/tech/{date}"
+         driver.get(url)
+
+         # Wait for the page to load
+         time.sleep(2)
+
+         # Step 2: Extract all links on the page
+         links = driver.find_elements(By.TAG_NAME, 'a')
+
+         # Step 3: Collect the href attributes, skipping internal, job, and ad links
+         extracted_links = [
+             href
+             for link in links
+             if (href := link.get_attribute('href')) is not None
+             and not href.startswith("https://tldr.tech")
+             and not href.startswith("https://jobs")
+             and "advertise" not in href
+         ]
+
+         # Output the extracted links
+         print("Extracted Links:")
+         print(len(extracted_links))
+         for idx, link in enumerate(extracted_links, start=1):
+             print(f"{idx}. {link}")
+
+         # Maximum number of threads to run concurrently
+         max_threads = 4
+
+         # Use a ThreadPoolExecutor to run at most four summaries at a time;
+         # generate() is submitted once per extracted link
+         with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
+             futures = []
+             for idx, link in enumerate(extracted_links, start=1):
+                 future = executor.submit(generate, link)
+                 futures.append((idx, link, future))
+
+             # Turn each finished summary into an audio file
+             for idx, link, future in futures:
+                 result = future.result()
+                 asyncio.run(generateAudioFile(result, idx))
+
+     except WebDriverException as e:
+         print(f"Error while loading the page: {e}")
+
+     finally:
+         # Close the WebDriver
+         driver.quit()
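The `btts` module is not among the files shown in this commit, so its interface can only be inferred from the call site: `generateAudioFile` must be a coroutine accepting the summary text and an index, since it is passed to `asyncio.run`. A hypothetical stand-in that satisfies `asyncio.run(generateAudioFile(result, idx))`, here delegating to the pyttsx3 helper from GenerateAIPodcast.py:

    # btts.py - hypothetical stub; the real module is not part of this diff
    from GenerateAIPodcast import generateMp3

    async def generateAudioFile(text: str, number: int) -> None:
        # Reuse the synchronous pyttsx3 helper as a quick local substitute
        generateMp3(text, number)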