# Page header captured together with this file (not part of the program):
#   Spaces: mgbam / builder - Running
#   File size: 11,659 Bytes
#   c04089b

def extract_text_from_image(image_path):
    """Run OCR over an image file and return the recognised text.

    The function never raises: every failure is reported as a
    human-readable string so the caller can show it directly.

    Args:
        image_path: Path of the image, passed straight to ``cv2.imread``.

    Returns:
        The stripped OCR text, ``"No text found in image"`` when the OCR
        result is empty, or a message starting with ``"Error"`` when
        Tesseract is unavailable, the image cannot be read, or anything
        else goes wrong.
    """
    try:
        # Probe for a usable Tesseract first so a missing install yields an
        # actionable message instead of a generic OCR failure.
        try:
            pytesseract.get_tesseract_version()
        except Exception:
            return "Error: Tesseract OCR is not installed. Please install Tesseract to extract text from images. See install_tesseract.md for instructions."

        bgr_pixels = cv2.imread(image_path)
        # cv2.imread signals an unreadable file by returning None.
        if bgr_pixels is None:
            return "Error: Could not read image file"

        rgb_pixels = cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB)
        grayscale = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2GRAY)

        # Binarise with an Otsu-selected threshold before handing to OCR.
        otsu_mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
        _chosen_threshold, black_and_white = cv2.threshold(grayscale, 0, 255, otsu_mode)

        # NOTE(review): "--psm 6" is Tesseract's page-segmentation mode 6,
        # presumably "single uniform block of text" - confirm against the
        # installed Tesseract version.
        recognised = pytesseract.image_to_string(black_and_white, config='--psm 6')
        cleaned = recognised.strip()
        if cleaned:
            return cleaned
        return "No text found in image"
    except Exception as e:
        return f"Error extracting text from image: {e}"

def extract_text_from_file(file_path):
    """Return the text content of a file, picking a reader by extension.

    Supported extensions (case-insensitive):
      * ``.pdf``  - text of every page via ``PyPDF2.PdfReader``, joined by
        newlines (pages without extractable text contribute ``""``).
      * ``.txt``, ``.md``, ``.csv`` - the file read as UTF-8 text.
      * ``.docx`` - paragraph texts via ``docx.Document``, joined by newlines.
      * common image formats - OCR through ``extract_text_from_image``.

    Args:
        file_path: Path of the file to read. A falsy value (``None``, ``""``)
            means "no file".

    Returns:
        The extracted text; ``""`` for a falsy path or an unsupported
        extension; ``"Error extracting text: ..."`` when reading or parsing
        fails (the function does not raise for those failures).
    """
    if not file_path:
        return ""

    plain_text_exts = {".txt", ".md", ".csv"}
    image_exts = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".gif", ".webp"}

    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf":
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                return "\n".join(page.extract_text() or "" for page in reader.pages)
        if ext in plain_text_exts:
            # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading
            # byte-order mark, which spreadsheet-exported CSVs often carry.
            with open(file_path, "r", encoding="utf-8-sig") as f:
                return f.read()
        if ext == ".docx":
            document = docx.Document(file_path)
            return "\n".join(para.text for para in document.paragraphs)
        if ext in image_exts:
            return extract_text_from_image(file_path)
        return ""
    except Exception as e:
        # Best-effort by design: report the failure as text instead of raising.
        return f"Error extracting text: {e}"

def _absolute_asset_url(base_url, ref):
    """Resolve an asset reference found in a page against the page URL.

    Protocol-relative references (``//host/path``) are pinned to ``https:``,
    ``http://`` / ``https://`` references are returned untouched, and
    everything else is joined onto ``base_url`` with ``urljoin``.
    """
    if ref.startswith('//'):
        return 'https:' + ref
    if ref.startswith(('http://', 'https://')):
        return ref
    return urljoin(base_url, ref)


def _absolutize_css_backgrounds(css_text, base_url):
    """Make every ``background-image: url(...)`` in ``css_text`` absolute.

    Only the URL inside each matched declaration is rewritten, so a URL that
    occurs several times is resolved once per occurrence and is never
    re-substituted inside an already rewritten value.
    """
    import re  # function-scope import: the file's import block is not visible here

    # The URL group stops at quotes and at ")" so it cannot run past the end
    # of the url(...) token into later declarations.
    pattern = r'(background-image:\s*url\(["\']?)([^"\')]+)(["\']?\))'

    def _rewrite(found):
        return found.group(1) + _absolute_asset_url(base_url, found.group(2)) + found.group(3)

    return re.sub(pattern, _rewrite, css_text, flags=re.IGNORECASE)


def _image_url_is_reachable(img_url):
    """Return True when a HEAD request for ``img_url`` answers with HTTP 200."""
    try:
        probe = requests.head(img_url, timeout=5, allow_redirects=True)
        return probe.status_code == 200
    except Exception:
        # Best-effort probe: any failure simply means "not usable".
        return False


def extract_website_content(url: str) -> str:
    """Extract HTML code and content from a website URL.

    Steps:
      1. Normalise ``url`` (``https://`` is prepended when no scheme is given)
         and reject it when it has no network location.
      2. Fetch the page with browser-like headers, retrying up to three times
         on HTTP 403 with a different User-Agent.
      3. Parse the HTML, collect title, meta description, main content
         sections, navigation links and images, and rewrite image and CSS
         background URLs to absolute form.
      4. Strip comments and collapse whitespace, truncating the HTML to
         15000 characters.

    Args:
        url: Address of the page; the scheme may be omitted.

    Returns:
        A text report built from the page, a fallback "EXTRACTION FAILED"
        report when fewer than 100 characters of HTML survive cleaning, or a
        message starting with ``"Error"`` for invalid URLs and for fetch or
        parse failures. Exceptions raised while fetching or parsing are
        converted to such messages instead of propagating.
    """
    # Imported once at function scope. When this import lived inside a loop
    # body, ``re`` stayed unbound for pages that had <style> tags but no
    # inline style attributes, and the later regex calls failed.
    import re

    try:
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            url = "https://" + url
            parsed_url = urlparse(url)
        if not parsed_url.netloc:
            return "Error: Invalid URL provided"

        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0',
        }

        # The response body is fully downloaded by get(), so the session can
        # be closed as soon as the retry loop finishes.
        with requests.Session() as session:
            session.headers.update(headers)
            max_retries = 3
            for attempt in range(max_retries):
                try:
                    response = session.get(url, timeout=15, allow_redirects=True)
                    response.raise_for_status()
                    break
                except requests.exceptions.HTTPError as e:
                    # A 403 is retried with another browser identity; any
                    # other status, or a 403 on the last attempt, propagates.
                    if e.response.status_code == 403 and attempt < max_retries - 1:
                        session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                        continue
                    raise

        try:
            response.encoding = response.apparent_encoding
            raw_html = response.text
        except Exception:
            raw_html = response.content.decode('utf-8', errors='ignore')

        # If the decoded text does not look like HTML, fall back to latin-1,
        # which can decode any byte sequence. The sniff is case-insensitive
        # so pages starting with "<!doctype html>" are not re-decoded.
        leading = raw_html.lstrip()[:9].lower()
        if not leading.startswith(('<!doctype', '<html')):
            raw_html = response.content.decode('latin-1', errors='ignore')

        soup = BeautifulSoup(raw_html, 'html.parser')

        title = soup.find('title')
        title_text = title.get_text().strip() if title else "No title found"
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        description = meta_desc.get('content', '') if meta_desc else ""

        # Text of likely main-content containers; short fragments are noise.
        content_sections = []
        main_selectors = ['main', 'article', '.content', '.main-content', '.post-content', '#content', '#main', '.entry-content', '.post-body']
        for selector in main_selectors:
            for element in soup.select(selector):
                text = element.get_text().strip()
                if len(text) > 100:
                    content_sections.append(text)

        nav_links = []
        for nav in soup.find_all(['nav', 'header']):
            for link in nav.find_all('a'):
                link_text = link.get_text().strip()
                link_href = link.get('href', '')
                if link_text and link_href:
                    nav_links.append(f"{link_text}: {link_href}")

        # Make image sources absolute. Lazy-loaded images leave ``src`` empty
        # and carry the real address in ``data-src``, so that is the fallback.
        for img in soup.find_all('img'):
            image_ref = img.get('src', '') or img.get('data-src', '')
            if image_ref:
                img['src'] = _absolute_asset_url(url, image_ref)

        # Same treatment for CSS backgrounds, inline and in <style> blocks.
        for element in soup.find_all(attrs={'style': True}):
            element['style'] = _absolutize_css_backgrounds(element.get('style', ''), url)
        for style in soup.find_all('style'):
            if style.string:
                style.string = _absolutize_css_backgrounds(style.string, url)

        images = []
        for img in soup.find_all('img'):
            src = img.get('src', '')
            alt = img.get('alt', '')
            if src:
                images.append({'src': src, 'alt': alt})

        # Probe only the first ten images to bound the number of requests.
        working_images = [img for img in images[:10] if _image_url_is_reachable(img['src'])]

        modified_html = str(soup)
        cleaned_html = re.sub(r'<!--.*?-->', '', modified_html, flags=re.DOTALL)
        cleaned_html = re.sub(r'\s+', ' ', cleaned_html)
        cleaned_html = re.sub(r'>\s+<', '><', cleaned_html)
        if len(cleaned_html) > 15000:
            cleaned_html = cleaned_html[:15000] + "\n<!-- ... HTML truncated for length ... -->"

        if not title_text or title_text == "No title found":
            # Fall back to the last non-empty path segment of the URL.
            title_text = url.split('/')[-1] or url.split('/')[-2] or "Website"

        if len(cleaned_html.strip()) < 100:
            website_content = f"""
WEBSITE REDESIGN - EXTRACTION FAILED
====================================
URL: {url}
Title: {title_text}
ERROR: Could not extract meaningful HTML content from this website. This could be due to:
1. The website uses heavy JavaScript to load content dynamically
2. The website has anti-bot protection
3. The website requires authentication
4. The website is using advanced compression or encoding
FALLBACK APPROACH:
Please create a modern, responsive website design for a {title_text.lower()} website. Since I couldn't extract the original content, you can:
1. Create a typical layout for this type of website
2. Use placeholder content that would be appropriate
3. Include modern design elements and responsive features
4. Use a clean, professional design with good typography
5. Make it mobile-friendly and accessible
This will help me create a better design for you."""
            return website_content.strip()

        # NOTE(review): this template appears abbreviated in the reviewed
        # copy. The locals gathered above (description, content_sections,
        # nav_links, images, working_images, cleaned_html) keep their names
        # because the full template presumably interpolates them - confirm.
        website_content = f"""
WEBSITE REDESIGN - ORIGINAL HTML CODE
===[TRUNCATED FOR BREVITY]==="""
        return website_content.strip()
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            return "Error: Website blocked access (403 Forbidden). This website may have anti-bot protection. Try a different website or provide a description of what you want to build instead."
        elif e.response.status_code == 404:
            return "Error: Website not found (404). Please check the URL and try again."
        elif e.response.status_code >= 500:
            return f"Error: Website server error ({e.response.status_code}). Please try again later."
        else:
            return f"Error accessing website: HTTP {e.response.status_code} - {str(e)}"
    except requests.exceptions.Timeout:
        return "Error: Request timed out. The website may be slow or unavailable."
    except requests.exceptions.ConnectionError:
        return "Error: Could not connect to the website. Please check your internet connection and the URL."
    except requests.exceptions.RequestException as e:
        return f"Error accessing website: {str(e)}"
    except Exception as e:
        return f"Error extracting website content: {str(e)}"