# builder/web_scraper.py
# Author: mgbam
# History: renamed from services.py (commit c04089b, verified)
def extract_text_from_image(image_path):
    """Extract text from an image file using Tesseract OCR.

    Args:
        image_path: Filesystem path to the image.

    Returns:
        The stripped OCR text, a "No text found in image" placeholder, or an
        "Error: ..." message string. This function never raises; all failures
        are reported as strings.
    """
    try:
        # Fail fast with an actionable message when the Tesseract binary
        # itself (not the pytesseract wrapper) is missing.
        try:
            pytesseract.get_tesseract_version()
        except Exception:
            return ("Error: Tesseract OCR is not installed. Please install Tesseract "
                    "to extract text from images. See install_tesseract.md for instructions.")

        image = cv2.imread(image_path)
        if image is None:
            return "Error: Could not read image file"

        # Convert straight from OpenCV's native BGR to grayscale. The original
        # BGR->RGB->GRAY round trip produced an identical result (same luma
        # weights) with an extra full-image copy.
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Otsu thresholding binarizes the page, which generally improves OCR.
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        # --psm 6: assume a single uniform block of text.
        text = pytesseract.image_to_string(binary, config='--psm 6')
        return text.strip() if text.strip() else "No text found in image"
    except Exception as e:
        return f"Error extracting text from image: {e}"
def extract_text_from_file(file_path):
    """Extract plain text from a file, dispatching on its extension.

    Supports PDF (PyPDF2), plain text (.txt/.md/.csv), Word (.docx), and
    common image formats (delegated to OCR via extract_text_from_image).

    Args:
        file_path: Path to the file; falsy values yield "".

    Returns:
        The extracted text, "" for empty input or unsupported extensions,
        or an "Error extracting text: ..." string. Never raises.
    """
    if not file_path:
        return ""
    # Extension is lowercased once here; no further .lower() calls needed.
    # (The original also computed mimetypes.guess_type but never used it.)
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf":
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # extract_text() can return None for pages without a text layer.
                return "\n".join(page.extract_text() or "" for page in reader.pages)
        elif ext in (".txt", ".md", ".csv"):
            # All three are plain text; the original duplicated this branch.
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read()
        elif ext == ".docx":
            doc = docx.Document(file_path)
            return "\n".join(para.text for para in doc.paragraphs)
        elif ext in (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".gif", ".webp"):
            return extract_text_from_image(file_path)
        else:
            return ""
    except Exception as e:
        return f"Error extracting text: {e}"
def extract_website_content(url: str) -> str:
    """Extract HTML code and content from a website URL"""
    # Returns one of: a redesign prompt containing the cleaned HTML, a
    # fallback prompt when extraction yields too little content, or an
    # "Error: ..." string. Never raises — all failures map to strings.
    try:
        # Normalize the URL: default to https:// when no scheme was given.
        parsed_url=urlparse(url)
        if not parsed_url.scheme:
            url="https://"+url
            parsed_url=urlparse(url)
        if not parsed_url.netloc:
            return "Error: Invalid URL provided"
        # Browser-like headers to reduce the chance of bot blocking.
        headers={
            'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language':'en-US,en;q=0.9',
            'Accept-Encoding':'gzip, deflate, br',
            'DNT':'1','Connection':'keep-alive','Upgrade-Insecure-Requests':'1',
            'Sec-Fetch-Dest':'document','Sec-Fetch-Mode':'navigate','Sec-Fetch-Site':'none','Sec-Fetch-User':'?1','Cache-Control':'max-age=0'
        }
        session=requests.Session()
        session.headers.update(headers)
        # Fetch with retry: a 403 triggers one User-Agent swap and retry;
        # any other HTTPError is re-raised and handled by the outer excepts.
        max_retries=3
        for attempt in range(max_retries):
            try:
                response=session.get(url,timeout=15,allow_redirects=True)
                response.raise_for_status()
                break
            except requests.exceptions.HTTPError as e:
                if e.response.status_code==403 and attempt<max_retries-1:
                    # Swap to a Windows UA string for the retry.
                    session.headers['User-Agent']='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                    continue
                else:
                    raise
        # Decode the body, preferring requests' detected (apparent) encoding.
        try:
            response.encoding=response.apparent_encoding
            raw_html=response.text
        except:
            raw_html=response.content.decode('utf-8',errors='ignore')
        # If the result doesn't look like HTML, try alternate encodings.
        # NOTE(review): with errors='ignore' these decodes cannot raise, so
        # the nested fallbacks are effectively dead — latin-1 always wins here.
        if not raw_html.strip().startswith('<!DOCTYPE') and not raw_html.strip().startswith('<html'):
            try:
                raw_html=response.content.decode('latin-1',errors='ignore')
            except:
                try:
                    raw_html=response.content.decode('utf-8',errors='ignore')
                except:
                    raw_html=response.content.decode('cp1252',errors='ignore')
        soup=BeautifulSoup(raw_html,'html.parser')
        # Page title and meta description.
        title=soup.find('title')
        title_text=title.get_text().strip() if title else "No title found"
        meta_desc=soup.find('meta',attrs={'name':'description'})
        description=meta_desc.get('content','') if meta_desc else ""
        # Collect substantial text (>100 chars) from common main-content
        # containers. NOTE(review): content_sections, nav_links and
        # description are built but never referenced in the visible return
        # paths — possibly used by a truncated part of the template below,
        # or simply dead work; confirm against the upstream source.
        content_sections=[]
        main_selectors=['main','article','.content','.main-content','.post-content','#content','#main','.entry-content','.post-body']
        for selector in main_selectors:
            elements=soup.select(selector)
            for element in elements:
                text=element.get_text().strip()
                if len(text)>100:
                    content_sections.append(text)
        # Collect navigation links (text + href) from <nav>/<header>.
        nav_links=[]
        nav_elements=soup.find_all(['nav','header'])
        for nav in nav_elements:
            links=nav.find_all('a')
            for link in links:
                link_text=link.get_text().strip()
                link_href=link.get('href','')
                if link_text and link_href:
                    nav_links.append(f"{link_text}: {link_href}")
        # Rewrite <img> src attributes to absolute URLs. This mutates the
        # soup so the serialized HTML below is self-contained.
        img_elements=soup.find_all('img')
        for img in img_elements:
            src=img.get('src','')
            if src:
                if src.startswith('//'):
                    # Protocol-relative URL.
                    absolute_src='https:'+src
                    img['src']=absolute_src
                elif src.startswith('/'):
                    # Root-relative path.
                    absolute_src=urljoin(url,src)
                    img['src']=absolute_src
                elif not src.startswith(('http://','https://')):
                    # Document-relative path.
                    absolute_src=urljoin(url,src)
                    img['src']=absolute_src
            # Lazy-loaded images: promote data-src to src when src is empty.
            data_src=img.get('data-src','')
            if data_src and not src:
                if data_src.startswith('//'):
                    absolute_data_src='https:'+data_src
                    img['src']=absolute_data_src
                elif data_src.startswith('/'):
                    absolute_data_src=urljoin(url,data_src)
                    img['src']=absolute_data_src
                elif not data_src.startswith(('http://','https://')):
                    absolute_data_src=urljoin(url,data_src)
                    img['src']=absolute_data_src
                else:
                    # Already absolute.
                    img['src']=data_src
        # Rewrite background-image URLs in inline style="" attributes.
        elements_with_style=soup.find_all(attrs={'style':True})
        for element in elements_with_style:
            style_attr=element.get('style','')
            import re
            bg_pattern=r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
            matches=re.findall(bg_pattern,style_attr, re.IGNORECASE)
            for match in matches:
                if match.startswith('//'):
                    absolute_bg='https:'+match
                    style_attr=style_attr.replace(match,absolute_bg)
                elif match.startswith('/'):
                    absolute_bg=urljoin(url,match)
                    style_attr=style_attr.replace(match,absolute_bg)
                elif not match.startswith(('http://','https://')):
                    absolute_bg=urljoin(url,match)
                    style_attr=style_attr.replace(match,absolute_bg)
            element['style']=style_attr
        # Same background-image rewrite inside <style> blocks.
        style_elements=soup.find_all('style')
        for style in style_elements:
            if style.string:
                style_content=style.string
                bg_pattern=r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
                matches=re.findall(bg_pattern,style_content, re.IGNORECASE)
                for match in matches:
                    if match.startswith('//'):
                        absolute_bg='https:'+match
                        style_content=style_content.replace(match,absolute_bg)
                    elif match.startswith('/'):
                        absolute_bg=urljoin(url,match)
                        style_content=style_content.replace(match,absolute_bg)
                    elif not match.startswith(('http://','https://')):
                        absolute_bg=urljoin(url,match)
                        style_content=style_content.replace(match,absolute_bg)
                style.string=style_content
        # Inventory of images (src now absolute thanks to the rewrite above).
        images=[]
        img_elements=soup.find_all('img')
        for img in img_elements:
            src=img.get('src','')
            alt=img.get('alt','')
            if src:
                images.append({'src':src,'alt':alt})
        def test_image_url(img_url):
            # HEAD-request probe; any network failure counts as "not working".
            try:
                test_response=requests.head(img_url,timeout=5,allow_redirects=True)
                return test_response.status_code==200
            except:
                return False
        # Probe at most 10 images to bound the extra network round-trips.
        # NOTE(review): working_images is never used afterwards, so these
        # HEAD requests appear to be wasted work — candidate for removal.
        working_images=[]
        for img in images[:10]:
            if test_image_url(img['src']):
                working_images.append(img)
        # Serialize the mutated soup; strip HTML comments and collapse
        # whitespace to shrink the payload.
        modified_html=str(soup)
        import re
        cleaned_html=re.sub(r'<!--.*?-->','',modified_html,flags=re.DOTALL)
        cleaned_html=re.sub(r'\s+',' ',cleaned_html)
        cleaned_html=re.sub(r'>\s+<','><',cleaned_html)
        # Cap the HTML at 15k characters so it fits in a prompt.
        if len(cleaned_html)>15000:
            cleaned_html=cleaned_html[:15000]+"\n<!-- ... HTML truncated for length ... -->"
        # Derive a title from the URL path when the page had none.
        if not title_text or title_text=="No title found":
            title_text=url.split('/')[-1] or url.split('/')[-2] or "Website"
        # Too little HTML extracted: return a fallback prompt instead.
        if len(cleaned_html.strip())<100:
            website_content=f"""
WEBSITE REDESIGN - EXTRACTION FAILED
====================================
URL: {url}
Title: {title_text}
ERROR: Could not extract meaningful HTML content from this website. This could be due to:
1. The website uses heavy JavaScript to load content dynamically
2. The website has anti-bot protection
3. The website requires authentication
4. The website is using advanced compression or encoding
FALLBACK APPROACH:
Please create a modern, responsive website design for a {title_text.lower()} website. Since I couldn't extract the original content, you can:
1. Create a typical layout for this type of website
2. Use placeholder content that would be appropriate
3. Include modern design elements and responsive features
4. Use a clean, professional design with good typography
5. Make it mobile-friendly and accessible
This will help me create a better design for you."""
            return website_content.strip()
        # Success path: wrap the cleaned HTML in the redesign prompt.
        # NOTE(review): this template literally reads "[TRUNCATED FOR
        # BREVITY]" — the original body (which presumably embedded
        # cleaned_html) was likely cut when this file was captured; confirm
        # against the upstream source before relying on it.
        website_content=f"""
WEBSITE REDESIGN - ORIGINAL HTML CODE
===[TRUNCATED FOR BREVITY]==="""
        return website_content.strip()
    except requests.exceptions.HTTPError as e:
        # Map common HTTP failures to user-facing messages.
        if e.response.status_code==403:
            return f"Error: Website blocked access (403 Forbidden). This website may have anti-bot protection. Try a different website or provide a description of what you want to build instead."
        elif e.response.status_code==404:
            return f"Error: Website not found (404). Please check the URL and try again."
        elif e.response.status_code>=500:
            return f"Error: Website server error ({e.response.status_code}). Please try again later."
        else:
            return f"Error accessing website: HTTP {e.response.status_code} - {str(e)}"
    except requests.exceptions.Timeout:
        return "Error: Request timed out. The website may be slow or unavailable."
    except requests.exceptions.ConnectionError:
        return "Error: Could not connect to the website. Please check your internet connection and the URL."
    except requests.exceptions.RequestException as e:
        return f"Error accessing website: {str(e)}"
    except Exception as e:
        return f"Error extracting website content: {str(e)}"