import asyncio
import concurrent.futures
import os
import time
from datetime import datetime

from llama_cpp import Llama
from openai import OpenAI
from selenium import webdriver
from selenium.common import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

from GenerateAIPodcast import generateMp3
from btts import generateAudioFile

# client = OpenAI(base_url="http://localhost:8080/v1", api_key="lm-studio")


'''def make_request(link):

    print("-----------------------------------------------------------------------------------------")
    print("Make Request is called")

    try:
        completion = client.chat.completions.create(
            model="model-identifier",
            messages=[
                {"role": "system",
                 "content": "Always answer short and most detailed and don't use * in your answers. It should be good to hear as a Podcast"},
                {"role": "user", "content": f"Please summarize this website: {link}."}
            ],
            temperature=0.7,
        )
        # print(f"Thread: {completion.choices[0].message}")
        # print("TEST:", completion.choices[0].message)
        message = completion.choices[0].message.content
        return message
    except Exception as e:
        print(f"Thread encountered an error: {e}")
'''



# Load a small quantized Llama 3.2 1B Instruct model via llama-cpp-python.
# Llama.from_pretrained downloads the GGUF file from the Hugging Face Hub on first use.
llm = Llama.from_pretrained(
    repo_id="hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF",
    filename="llama-3.2-1b-instruct-q8_0.gguf",
)

def generate(link: str):
    # Ask the local Llama model for a podcast-style summary of the given link.
    # Note: only the URL string is sent to the model, not the fetched page content.
    output = llm.create_chat_completion(
        messages=[
            {"role": "system",
             "content": "Always answer short and most detailed and don't use * in your answers. It should be good to hear as a Podcast"},
            {"role": "user", "content": f"Please summarize this website: {link}."}
        ]
    )

    # Return the generated text from the response dict.
    return output['choices'][0]['message']['content']
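
# Hypothetical quick check (not part of the original flow): summarize a single URL.
# print(generate("https://example.com/some-article"))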



def run_tldr_crawler():
    # Setup Selenium WebDriver
    options = webdriver.ChromeOptions()
    # options.add_argument('--headless')  # Run in headless mode (no browser UI)
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')

    # Initialize the WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    try:
        date = datetime.today().strftime('%Y-%m-%d')
        print(date)
        # Hard-coded override: TLDR does not publish on weekends, so point this at a
        # recent weekday issue. Comment it out on weekdays to use today's date.
        date = '2025-03-07'
        # Step 1: Navigate to the TLDR tech issue for that date
        url = f"https://tldr.tech/tech/{date}"
        driver.get(url)

        # Step 2: Wait for the page to load
        time.sleep(2)

        # Step 3: Extract all links on the page
        links = driver.find_elements(By.TAG_NAME, 'a')

        # Collect href attributes, keeping only external article links and skipping
        # tldr.tech navigation, job board, and sponsor/advertise links
        # extracted_links = [link.get_attribute('href') for link in links if link.get_attribute('href') is not None]

        extracted_links = [
            link.get_attribute('href')
            for link in links
            if link.get_attribute('href') is not None and
               not link.get_attribute('href').startswith("https://tldr.tech") and
               not link.get_attribute('href').startswith("https://jobs") and
               not "advertise" in link.get_attribute('href')
        ]

        # Output the extracted links
        print("Extracted Links:")
        print(len(extracted_links))
        for idx, link in enumerate(extracted_links, start=1):
            print(f"{idx}. {link}")

        # Maximum number of threads that should run at the same time
        max_threads = 4

        # Use a ThreadPoolExecutor to run at most 4 summarization calls in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
            # generate() is submitted for every link in extracted_links;
            # enumerate also provides the index in case it is needed
            futures = []
            for idx, link in enumerate(extracted_links, start=1):
                future = executor.submit(generate, link)
                futures.append((idx, link, future))
                # print(f"{idx}. {link}")

                # print(future.result())

            # Collect each summary as it finishes and render it to an audio file
            for idx, link, future in futures:
                result = future.result()
                # print(f"{idx}. {link} - Result {result}")
                asyncio.run(generateAudioFile(result, idx))


    except WebDriverException as e:
        print(f"Fehler beim Laden der Seite: {e}")


    finally:
        # Close the WebDriver
        driver.quit()
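

# Assumed entry point (not shown in the original file): allow running this module
# directly as a script.
if __name__ == "__main__":
    run_tldr_crawler()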