def extract_text_from_image(image_path):
    """Extract text from an image file using Tesseract OCR.

    Returns the recognized text, a "No text found" message, or an
    "Error: ..." string — this module reports failures as strings
    rather than raising.
    """
    try:
        # Fail fast with an actionable message if the tesseract binary is absent.
        try:
            pytesseract.get_tesseract_version()
        except Exception:
            return ("Error: Tesseract OCR is not installed. Please install "
                    "Tesseract to extract text from images. See "
                    "install_tesseract.md for instructions.")

        image = cv2.imread(image_path)
        if image is None:
            return "Error: Could not read image file"

        # Grayscale + Otsu binarization generally improves OCR accuracy.
        # (Direct BGR->GRAY is equivalent to the original BGR->RGB->GRAY chain.)
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        # --psm 6: assume a single uniform block of text.
        text = pytesseract.image_to_string(binary, config='--psm 6')
        stripped = text.strip()
        return stripped if stripped else "No text found in image"
    except Exception as e:
        return f"Error extracting text from image: {e}"


def extract_text_from_file(file_path):
    """Extract plain text from a file, dispatching on its extension.

    Supports PDF (PyPDF2), plain text (.txt/.md/.csv), Word (.docx) and
    common image formats (via OCR). Unknown extensions and an empty path
    yield ""; extraction failures yield an "Error: ..." string.
    """
    if not file_path:
        return ""
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf":
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # extract_text() may return None for image-only pages.
                return "\n".join(page.extract_text() or "" for page in reader.pages)
        elif ext in (".txt", ".md", ".csv"):
            # All three are read identically as UTF-8 text.
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read()
        elif ext == ".docx":
            doc = docx.Document(file_path)
            return "\n".join(para.text for para in doc.paragraphs)
        elif ext in (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".gif", ".webp"):
            return extract_text_from_image(file_path)
        else:
            return ""
    except Exception as e:
        return f"Error extracting text: {e}"


def extract_website_content(url: str) -> str:
    """Extract HTML code and content from a website URL.

    Fetches the page with browser-like headers, absolutizes image and
    CSS background URLs, and returns a large prompt-ready text report
    (cleaned HTML plus extracted title/content/links/images). All
    failures are reported as "Error: ..." strings, not raised.
    """
    import re  # single local import (the original imported re twice mid-function)

    try:
        # Normalise the URL: default to https:// when no scheme is given.
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            url = "https://" + url
            parsed_url = urlparse(url)
        if not parsed_url.netloc:
            return "Error: Invalid URL provided"

        # Browser-like headers reduce the chance of anti-bot 403 responses.
        # NOTE(review): advertising 'br' without the brotli package installed
        # can produce undecodable bodies — confirm brotli is a dependency.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0',
        }
        session = requests.Session()
        session.headers.update(headers)

        # Retry loop: 403s occasionally succeed on a later attempt.
        max_retries = 3
        response = None
        for attempt in range(max_retries):
            try:
                response = session.get(url, timeout=15, allow_redirects=True)
                response.raise_for_status()
                break
            except requests.exceptions.HTTPError as e:
                # NOTE(review): this branch was corrupted in the source;
                # reconstructed as "retry 403s, re-raise everything else"
                # (the outer HTTPError handler formats the final message).
                if e.response.status_code == 403 and attempt < max_retries - 1:
                    continue
                raise

        # NOTE(review): soup/title/content setup was lost to source corruption;
        # reconstructed from the later uses of `soup`, `title_text`, and
        # `content_sections` — verify against the original file.
        soup = BeautifulSoup(response.content, 'html.parser')

        title_tag = soup.find('title')
        title_text = title_tag.get_text().strip() if title_tag else "No title found"

        # Collect substantial visible-text sections (>100 chars).
        content_sections = []
        for element in soup.find_all(['main', 'article', 'section', 'p']):
            text = element.get_text().strip()
            if len(text) > 100:
                content_sections.append(text)

        # Navigation links found inside <nav>/<header> elements.
        nav_links = []
        for nav in soup.find_all(['nav', 'header']):
            for link in nav.find_all('a'):
                link_text = link.get_text().strip()
                link_href = link.get('href', '')
                if link_text and link_href:
                    nav_links.append(f"{link_text}: {link_href}")

        def _absolutize(candidate):
            # Resolve protocol-relative (//...), root-relative (/...) and
            # plain relative URLs against the page URL; pass absolutes through.
            if candidate.startswith('//'):
                return 'https:' + candidate
            if not candidate.startswith(('http://', 'https://')):
                return urljoin(url, candidate)
            return candidate

        # Rewrite <img> src to absolute URLs; fall back to the lazy-loading
        # data-src attribute when src is empty.
        for img in soup.find_all('img'):
            src = img.get('src', '')
            if src:
                img['src'] = _absolutize(src)
            else:
                data_src = img.get('data-src', '')
                if data_src:
                    img['src'] = _absolutize(data_src)

        # Rewrite CSS background-image URLs, both in inline style attributes
        # and inside <style> blocks.
        bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
        for element in soup.find_all(attrs={'style': True}):
            style_attr = element.get('style', '')
            for match in re.findall(bg_pattern, style_attr, re.IGNORECASE):
                style_attr = style_attr.replace(match, _absolutize(match))
            element['style'] = style_attr

        for style in soup.find_all('style'):
            if style.string:
                style_content = style.string
                for match in re.findall(bg_pattern, style_content, re.IGNORECASE):
                    style_content = style_content.replace(match, _absolutize(match))
                style.string = style_content

        # Probe the first 10 images with HEAD requests and keep the live ones.
        images = [{'src': img.get('src', ''), 'alt': img.get('alt', '')}
                  for img in soup.find_all('img') if img.get('src', '')]

        def _image_is_reachable(img_url):
            # Best-effort liveness check; any network error means "skip".
            try:
                probe = requests.head(img_url, timeout=5, allow_redirects=True)
                return probe.status_code == 200
            except requests.exceptions.RequestException:
                return False

        working_images = [img for img in images[:10] if _image_is_reachable(img['src'])]

        # Strip HTML comments, collapse whitespace, and cap the payload size.
        # NOTE(review): the comment-stripping pattern was lost in the source;
        # '<!--.*?-->' is the standard non-greedy form for this cleanup.
        modified_html = str(soup)
        cleaned_html = re.sub(r'<!--.*?-->', '', modified_html, flags=re.DOTALL)
        cleaned_html = re.sub(r'\s+', ' ', cleaned_html)
        cleaned_html = re.sub(r'>\s+<', '><', cleaned_html)
        if len(cleaned_html) > 15000:
            # NOTE(review): original truncation marker was corrupted; any
            # suffix works since this text only feeds a prompt downstream.
            cleaned_html = cleaned_html[:15000] + "\n<!-- ... HTML truncated ... -->"

        if not title_text or title_text == "No title found":
            title_text = url.split('/')[-1] or url.split('/')[-2] or "Website"

        # Too little HTML extracted: return a structured failure/fallback prompt.
        if len(cleaned_html.strip()) < 100:
            website_content = f"""
WEBSITE REDESIGN - EXTRACTION FAILED
====================================

URL: {url}
Title: {title_text}

ERROR: Could not extract meaningful HTML content from this website. This could be due to:
1. The website uses heavy JavaScript to load content dynamically
2. The website has anti-bot protection
3. The website requires authentication
4. The website is using advanced compression or encoding

FALLBACK APPROACH: Please create a modern, responsive website design for a {title_text.lower()} website. Since I couldn't extract the original content, you can:
1. Create a typical layout for this type of website
2. Use placeholder content that would be appropriate
3. Include modern design elements and responsive features
4. Use a clean, professional design with good typography
5. Make it mobile-friendly and accessible

This will help me create a better design for you."""
            return website_content.strip()

        # NOTE(review): the success report below was truncated in the source;
        # reconstructed to include everything gathered above.
        sections_text = "\n\n".join(content_sections[:10])
        nav_text = "\n".join(nav_links[:20])
        images_text = "\n".join(f"{img['alt']}: {img['src']}" for img in working_images)
        website_content = f"""
WEBSITE REDESIGN - ORIGINAL HTML CODE
=====================================

URL: {url}
Title: {title_text}

ORIGINAL HTML:
{cleaned_html}

MAIN CONTENT SECTIONS:
{sections_text}

NAVIGATION LINKS:
{nav_text}

WORKING IMAGES:
{images_text}"""
        return website_content.strip()

    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            return f"Error: Website blocked access (403 Forbidden). This website may have anti-bot protection. Try a different website or provide a description of what you want to build instead."
        elif e.response.status_code == 404:
            return f"Error: Website not found (404). Please check the URL and try again."
        elif e.response.status_code >= 500:
            return f"Error: Website server error ({e.response.status_code}). Please try again later."
        else:
            return f"Error accessing website: HTTP {e.response.status_code} - {str(e)}"
    except requests.exceptions.Timeout:
        return "Error: Request timed out. The website may be slow or unavailable."
    except requests.exceptions.ConnectionError:
        return "Error: Could not connect to the website. Please check your internet connection and the URL."
    except requests.exceptions.RequestException as e:
        return f"Error accessing website: {str(e)}"
    except Exception as e:
        return f"Error extracting website content: {str(e)}"