import datetime
import re

import requests
import torch
from bs4 import BeautifulSoup, Tag
from transformers import AutoTokenizer, AutoModelForCausalLM

# Regexes for markup that is stripped before the HTML is handed to the model.
# Insertion order is the order in which RemoveNoise applies them.
NoisePatterns = {
    '(No)Script': r'<[ ]*(script|noscript)[^>]*?>.*?<\/[ ]*\1[ ]*>',
    'Style': r'<[ ]*(style)[^>]*?>.*?<\/[ ]*\1[ ]*>',
    'Svg': r'<[ ]*(svg)[^>]*?>.*?<\/[ ]*\1[ ]*>',
    'Meta+Link': r'<[ ]*(meta|link)[^>]*?[\/]?[ ]*>',
    'Comment': r'<[ ]*!--.*?--[ ]*>',
    'Base64Img': r'<[ ]*img[^>]+src="data:image\/[^;]+;base64,[^"]+"[^>]*[\/]?[ ]*>',
    # This entry used to be an empty pattern, which made its substitution a
    # no-op and left the doctype declaration in the output.
    'DocType': r'<[ ]*!DOCTYPE[^>]*>',
    'DataAttributes': r'[ ]+data-[\w-]+="[^"]*"',
    'Classes': r'[ ]+class="[^"]*"',
    'EmptyAttributes': r'[ ]+[a-z-]+=""',
    'DateTime': r'[ ]+datetime="[^"]*"',
    'EmptyTags': r'(?:<[ ]*([a-z]{1,10})[^>]*>[ \t\r\n]*){1,5}(?:<\/[ ]*\1[ ]*>){1,5}',
    'EmptyLines': r'^[ \t]*\r?\n',
}

# Patterns applied with re.MULTILINE only (case-sensitive, '.' stops at newlines).
_LineBasedPatterns = frozenset({'EmptyLines', 'EmptyTags'})


def RemoveNoise(RawHtml: str) -> str:
    '''Remove noise from HTML content.

    Every pattern in NoisePatterns is applied once, in dictionary order, and
    each match is replaced with the empty string.

    Args:
        RawHtml (str): The raw HTML content.

    Returns:
        str: Cleaned HTML content without noise.
    '''
    CleanedHtml = RawHtml
    for PatternName, Pattern in NoisePatterns.items():
        if PatternName in _LineBasedPatterns:
            Flags = re.MULTILINE
        else:
            Flags = re.DOTALL | re.IGNORECASE | re.MULTILINE
        CleanedHtml = re.sub(Pattern, '', CleanedHtml, flags=Flags)
    return CleanedHtml


def FetchHtmlContent(Url: str, Timeout: float = 30.0) -> str | int:
    '''Fetch HTML content from a URL.

    Args:
        Url (str): The URL to fetch HTML content from.
        Timeout (float): Seconds to wait for the server before giving up.

    Returns:
        str | int: The response body when the status code is 200, otherwise
        the HTTP status code.

    Raises:
        requests.exceptions.RequestException: On connection failures or when
            the timeout elapses.
    '''
    Headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    Response = requests.get(Url, headers=Headers, timeout=Timeout)
    if Response.status_code == 200:
        return Response.text
    return Response.status_code


def PurifyHtml(Url: str) -> str | None:
    '''Fetch a page, strip noisy markup and convert it to markdown.

    The page is downloaded, prettified with BeautifulSoup, cleaned with
    RemoveNoise and passed to the 'jinaai/ReaderLM-v2' model on CPU.

    Args:
        Url (str): The URL of the page to convert.

    Returns:
        str | None: The text generated by the model, or None when the page
        could not be fetched (the status code is printed in that case).
    '''
    RawHtml = FetchHtmlContent(Url)
    if not isinstance(RawHtml, str):
        print(f'Failed to fetch HTML content. Status code: {RawHtml}')
        return None

    Soup = BeautifulSoup(RawHtml, 'html.parser')
    CleanedHtml = RemoveNoise(str(Soup.prettify()))

    # The summary list held a single empty entry, so one blank line was
    # printed here; kept so stdout stays the same.
    print('')

    Tokenizer = AutoTokenizer.from_pretrained('jinaai/ReaderLM-v2')
    Model = AutoModelForCausalLM.from_pretrained(
        'jinaai/ReaderLM-v2',
        torch_dtype=torch.float32,
        device_map='cpu',
    )

    Prompt = f'Convert this HTML to markdown:\n\n{CleanedHtml}'
    Inputs = Tokenizer(Prompt, return_tensors='pt', truncation=True, max_length=8192)
    Outputs = Model.generate(
        Inputs.input_ids,
        attention_mask=Inputs.attention_mask,
        max_new_tokens=8192,
        do_sample=False,
    )

    # Drop the prompt by token count rather than by len(Prompt): the prompt
    # may have been truncated to max_length, and decoded text is not
    # guaranteed to reproduce the prompt character for character.
    PromptTokenCount = Inputs.input_ids.shape[1]
    GeneratedTokens = Outputs[0][PromptTokenCount:]
    return Tokenizer.decode(GeneratedTokens, skip_special_tokens=True).strip()