import os
import re

import cv2
import docx
import PyPDF2
import pytesseract
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse


def extract_text_from_image(image_path):
    """Extract text from an image using OCR."""
    try:
        # Fail fast with an actionable message if the Tesseract binary is missing.
        try:
            pytesseract.get_tesseract_version()
        except Exception:
            return ("Error: Tesseract OCR is not installed. Please install Tesseract "
                    "to extract text from images. See install_tesseract.md for instructions.")

        image = cv2.imread(image_path)
        if image is None:
            return "Error: Could not read image file"

        # Binarize with Otsu's threshold; OCR is more reliable on clean
        # black-and-white input.
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        # --psm 6: assume a single uniform block of text.
        text = pytesseract.image_to_string(binary, config='--psm 6')
        return text.strip() if text.strip() else "No text found in image"
    except Exception as e:
        return f"Error extracting text from image: {e}"


def extract_text_from_file(file_path):
    """Dispatch on file extension and return the extracted text, or an error string."""
    if not file_path:
        return ""
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf":
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # extract_text() can return None for image-only pages.
                return "\n".join(page.extract_text() or "" for page in reader.pages)
        elif ext in (".txt", ".md", ".csv"):
            # Plain-text formats are read verbatim.
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read()
        elif ext == ".docx":
            doc = docx.Document(file_path)
            return "\n".join(para.text for para in doc.paragraphs)
        elif ext in (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".gif", ".webp"):
            return extract_text_from_image(file_path)
        else:
            return ""
    except Exception as e:
        return f"Error extracting text: {e}"


def extract_website_content(url: str) -> str:
    """Extract HTML code and content from a website URL."""
    try:
        # Default to https:// when the caller omits the scheme.
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            url = "https://" + url
            parsed_url = urlparse(url)
        if not parsed_url.netloc:
            return "Error: Invalid URL provided"

        # Browser-like headers reduce the chance of being served a bot-detection page.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0',
        }
        session = requests.Session()
        session.headers.update(headers)

        # Up to three attempts; on a 403, swap the User-Agent once and retry.
        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = session.get(url, timeout=15, allow_redirects=True)
                response.raise_for_status()
                break
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 403 and attempt < max_retries - 1:
                    session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                    continue
                raise
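
        # Past this point `response` holds a successful reply; a persistent HTTP
        # error has already propagated to the handlers at the end of the function.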

        # Prefer the encoding detected from the body over the (often wrong) header.
        try:
            response.encoding = response.apparent_encoding
            raw_html = response.text
        except Exception:
            raw_html = response.content.decode('utf-8', errors='ignore')

        # If the result does not look like HTML, retry with common fallback codecs.
        # The check is case-insensitive: many pages start with lowercase <!doctype html>.
        head = raw_html.lstrip().lower()
        if not head.startswith('<!doctype') and not head.startswith('<html'):
            for codec in ('latin-1', 'utf-8', 'cp1252'):
                try:
                    raw_html = response.content.decode(codec, errors='ignore')
                    break
                except Exception:
                    continue
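
        # html.parser is Python's built-in parser, so no lxml dependency is required.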

        soup = BeautifulSoup(raw_html, 'html.parser')

        title = soup.find('title')
        title_text = title.get_text().strip() if title else "No title found"

        meta_desc = soup.find('meta', attrs={'name': 'description'})
        description = meta_desc.get('content', '') if meta_desc else ""

        # Collect substantial text blocks from the usual main-content containers.
        content_sections = []
        main_selectors = ['main', 'article', '.content', '.main-content', '.post-content',
                          '#content', '#main', '.entry-content', '.post-body']
        for selector in main_selectors:
            for element in soup.select(selector):
                text = element.get_text().strip()
                if len(text) > 100:
                    content_sections.append(text)

        # Gather navigation links as "label: href" pairs from <nav> and <header>.
        nav_links = []
        for nav in soup.find_all(['nav', 'header']):
            for link in nav.find_all('a'):
                link_text = link.get_text().strip()
                link_href = link.get('href', '')
                if link_text and link_href:
                    nav_links.append(f"{link_text}: {link_href}")

        def absolutize(candidate):
            """Make a possibly relative URL absolute (forcing https for //-URLs)."""
            if candidate.startswith('//'):
                return 'https:' + candidate
            if not candidate.startswith(('http://', 'https://')):
                return urljoin(url, candidate)
            return candidate

        # Rewrite <img> sources to absolute URLs, promoting lazy-load data-src
        # attributes when src is absent.
        for img in soup.find_all('img'):
            src = img.get('src', '')
            data_src = img.get('data-src', '')
            if src:
                img['src'] = absolutize(src)
            elif data_src:
                img['src'] = absolutize(data_src)
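
        # urljoin resolves both root-relative and page-relative paths, e.g.
        #   urljoin("https://example.com/a/b", "/img/x.png") -> "https://example.com/img/x.png"
        #   urljoin("https://example.com/a/b", "img/x.png")  -> "https://example.com/a/img/x.png"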

        # Rewrite background-image URLs in inline style attributes and <style> blocks.
        bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
        for element in soup.find_all(attrs={'style': True}):
            style_attr = element.get('style', '')
            for match in re.findall(bg_pattern, style_attr, re.IGNORECASE):
                style_attr = style_attr.replace(match, absolutize(match))
            element['style'] = style_attr

        for style in soup.find_all('style'):
            if style.string:
                style_content = style.string
                for match in re.findall(bg_pattern, style_content, re.IGNORECASE):
                    style_content = style_content.replace(match, absolutize(match))
                style.string = style_content

        # Collect image metadata, then check that the first few URLs actually resolve.
        images = []
        for img in soup.find_all('img'):
            src = img.get('src', '')
            alt = img.get('alt', '')
            if src:
                images.append({'src': src, 'alt': alt})

        def test_image_url(img_url):
            try:
                test_response = requests.head(img_url, timeout=5, allow_redirects=True)
                return test_response.status_code == 200
            except Exception:
                return False

        working_images = [img for img in images[:10] if test_image_url(img['src'])]
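
        # Only the first 10 images are probed to bound latency, and HEAD avoids
        # downloading image bodies; servers that reject HEAD will simply be skipped.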

        # Strip comments and collapse whitespace to shrink the HTML payload.
        modified_html = str(soup)
        cleaned_html = re.sub(r'<!--.*?-->', '', modified_html, flags=re.DOTALL)
        cleaned_html = re.sub(r'\s+', ' ', cleaned_html)
        cleaned_html = re.sub(r'>\s+<', '><', cleaned_html)
        if len(cleaned_html) > 15000:
            cleaned_html = cleaned_html[:15000] + "\n<!-- ... HTML truncated for length ... -->"

        # Derive a title from the URL when the page did not provide one.
        if not title_text or title_text == "No title found":
            title_text = url.split('/')[-1] or url.split('/')[-2] or "Website"

        # Very little HTML usually means a JS-heavy, protected, or authenticated site;
        # return a structured fallback prompt instead.
        if len(cleaned_html.strip()) < 100:
            website_content = f"""
WEBSITE REDESIGN - EXTRACTION FAILED
====================================

URL: {url}
Title: {title_text}

ERROR: Could not extract meaningful HTML content from this website. This could be due to:

1. The website uses heavy JavaScript to load content dynamically
2. The website has anti-bot protection
3. The website requires authentication
4. The website is using advanced compression or encoding

FALLBACK APPROACH:
Please create a modern, responsive website design for a {title_text.lower()} website. Since I couldn't extract the original content, you can:

1. Create a typical layout for this type of website
2. Use placeholder content that would be appropriate
3. Include modern design elements and responsive features
4. Use a clean, professional design with good typography
5. Make it mobile-friendly and accessible

This will help me create a better design for you."""
            return website_content.strip()

        website_content = f"""
WEBSITE REDESIGN - ORIGINAL HTML CODE
===[TRUNCATED FOR BREVITY]==="""
        return website_content.strip()

    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            return ("Error: Website blocked access (403 Forbidden). This website may have "
                    "anti-bot protection. Try a different website or provide a description "
                    "of what you want to build instead.")
        elif e.response.status_code == 404:
            return "Error: Website not found (404). Please check the URL and try again."
        elif e.response.status_code >= 500:
            return f"Error: Website server error ({e.response.status_code}). Please try again later."
        else:
            return f"Error accessing website: HTTP {e.response.status_code} - {str(e)}"
    except requests.exceptions.Timeout:
        return "Error: Request timed out. The website may be slow or unavailable."
    except requests.exceptions.ConnectionError:
        return "Error: Could not connect to the website. Please check your internet connection and the URL."
    except requests.exceptions.RequestException as e:
        return f"Error accessing website: {str(e)}"
    except Exception as e:
        return f"Error extracting website content: {str(e)}"
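

# A minimal smoke test (hypothetical entry point; requires network access):
if __name__ == "__main__":
    print(extract_website_content("example.com")[:500])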