import asyncio
import concurrent.futures
import os
import time
from datetime import datetime

from llama_cpp import Llama
from openai import OpenAI
from selenium import webdriver
from selenium.common import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

from GenerateAIPodcast import generateMp3
from btts import generateAudioFile

# client = OpenAI(base_url="http://localhost:8080/v1", api_key="lm-studio")


'''def make_request(link):

    print("-----------------------------------------------------------------------------------------")
    print("Make Request is called")

    try:
        completion = client.chat.completions.create(
            model="model-identifier",
            messages=[
                {"role": "system",
                 "content": "Always answer short and most detailed and don't use * in your answers. It should be good to hear as a Podcast"},
                {"role": "user", "content": f"Please summarize this website: {link}."}
            ],
            temperature=0.7,
        )
        # print(f"Thread: {completion.choices[0].message}")
        # print("TEST:", completion.choices[0].message)
        message = completion.choices[0].message.content
        return message
    except Exception as e:
        print(f"Thread encountered an error: {e}")
'''



# Load a small quantized Llama 3.2 1B Instruct model via llama-cpp-python.
# Llama.from_pretrained downloads the GGUF file from the Hugging Face Hub on first use.
llm = Llama.from_pretrained(
    repo_id="hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF",
    filename="llama-3.2-1b-instruct-q8_0.gguf",
)

def generate(link: str):
    # Ask the local Llama model for a podcast-style summary of the given link.
    # Note: only the URL string is sent to the model, not the fetched page content.
    output = llm.create_chat_completion(
        messages=[
            {"role": "system",
             "content": "Always answer short and most detailed and don't use * in your answers. It should be good to hear as a Podcast"},
            {"role": "user", "content": f"Please summarize this website: {link}."}
        ]
    )

    # Return the generated text from the response dict.
    return output['choices'][0]['message']['content']
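
# Hypothetical quick check (not part of the original flow): summarize a single URL.
# print(generate("https://example.com/some-article"))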



def run_tldr_crawler():
    # Setup Selenium WebDriver
    options = webdriver.ChromeOptions()
    # options.add_argument('--headless')  # Run in headless mode (no browser UI)
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')

    # Initialize the WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    try:
        date = datetime.today().strftime('%Y-%m-%d')
        print(date)
        # Hard-coded override: TLDR does not publish on weekends, so point this at a
        # recent weekday issue. Comment it out on weekdays to use today's date.
        date = '2025-03-07'
        # Step 1: Navigate to the TLDR tech issue for that date
        url = f"https://tldr.tech/tech/{date}"
        driver.get(url)

        # Step 2: Wait for the page to load
        time.sleep(2)

        # Step 3: Extract all links on the page
        links = driver.find_elements(By.TAG_NAME, 'a')

        # Collect href attributes, keeping only external article links and skipping
        # tldr.tech navigation, job board, and sponsor/advertise links
        # extracted_links = [link.get_attribute('href') for link in links if link.get_attribute('href') is not None]

        extracted_links = [
            link.get_attribute('href')
            for link in links
            if link.get_attribute('href') is not None and
               not link.get_attribute('href').startswith("https://tldr.tech") and
               not link.get_attribute('href').startswith("https://jobs") and
               not "advertise" in link.get_attribute('href')
        ]

        # Output the extracted links
        print("Extracted Links:")
        print(len(extracted_links))
        for idx, link in enumerate(extracted_links, start=1):
            print(f"{idx}. {link}")

        # Maximum number of threads that should run at the same time
        max_threads = 4

        # Use a ThreadPoolExecutor to run at most 4 summarization calls in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
            # generate() is submitted for every link in extracted_links;
            # enumerate also provides the index in case it is needed
            futures = []
            for idx, link in enumerate(extracted_links, start=1):
                future = executor.submit(generate, link)
                futures.append((idx, link, future))
                # print(f"{idx}. {link}")

                # print(future.result())

            # Collect each summary as it finishes and render it to an audio file
            for idx, link, future in futures:
                result = future.result()
                # print(f"{idx}. {link} - Result {result}")
                asyncio.run(generateAudioFile(result, idx))


    except WebDriverException as e:
        print(f"Fehler beim Laden der Seite: {e}")


    finally:
        # Close the WebDriver
        driver.quit()
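

# Assumed entry point (not shown in the original file): allow running this module
# directly as a script.
if __name__ == "__main__":
    run_tldr_crawler()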