"""Helpers for extracting text from files and images and for scraping website content."""

import os
import re
from urllib.parse import urljoin, urlparse

import cv2
import docx
import PyPDF2
import pytesseract
import requests
from bs4 import BeautifulSoup

def extract_text_from_image(image_path):
    """Extract text from an image using Tesseract OCR."""
    try:
        # Fail early with an actionable message if the Tesseract binary is missing.
        try:
            pytesseract.get_tesseract_version()
        except Exception:
            return ("Error: Tesseract OCR is not installed. Please install Tesseract "
                    "to extract text from images. See install_tesseract.md for instructions.")
        image = cv2.imread(image_path)
        if image is None:
            return "Error: Could not read image file"
        # Binarize with Otsu's threshold; OCR is generally more reliable on
        # high-contrast black-and-white input.
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        # --psm 6: assume a single uniform block of text.
        text = pytesseract.image_to_string(binary, config='--psm 6')
        return text.strip() if text.strip() else "No text found in image"
    except Exception as e:
        return f"Error extracting text from image: {e}"
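
# Hypothetical usage (the path below is a placeholder, not from the original module):
#   text = extract_text_from_image("scans/receipt.png")
#   # -> OCR'd text, "No text found in image", or an "Error: ..." string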

def extract_text_from_file(file_path):
    """Extract text from a PDF, plain-text/Markdown/CSV, DOCX, or image file."""
    if not file_path:
        return ""
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf":
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # extract_text() returns None for pages with no text layer.
                return "\n".join(page.extract_text() or "" for page in reader.pages)
        elif ext in (".txt", ".md", ".csv"):
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read()
        elif ext == ".docx":
            doc = docx.Document(file_path)
            return "\n".join(para.text for para in doc.paragraphs)
        elif ext in (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".gif", ".webp"):
            return extract_text_from_image(file_path)
        else:
            return ""
    except Exception as e:
        return f"Error extracting text: {e}"
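
# Hypothetical usage (file names are placeholders):
#   extract_text_from_file("report.pdf")  # -> concatenated page text
#   extract_text_from_file("photo.jpg")   # -> OCR via extract_text_from_image()
#   extract_text_from_file("data.xlsx")   # -> "" (unsupported extension)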

def extract_website_content(url: str) -> str:
    """Extract HTML code and content from a website URL."""
    try:
        # Normalize the URL: default to https:// when no scheme is given.
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            url = "https://" + url
            parsed_url = urlparse(url)
        if not parsed_url.netloc:
            return "Error: Invalid URL provided"
        # Browser-like headers reduce the chance of being blocked as a bot.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0',
        }
        session = requests.Session()
        session.headers.update(headers)
        # On 403, retry with a different User-Agent; some sites block by UA string.
        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = session.get(url, timeout=15, allow_redirects=True)
                response.raise_for_status()
                break
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 403 and attempt < max_retries - 1:
                    session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                    continue
                raise
        # Prefer the detected (apparent) encoding over the declared one, falling
        # back to a lossy UTF-8 decode if detection fails.
        try:
            response.encoding = response.apparent_encoding
            raw_html = response.text
        except Exception:
            raw_html = response.content.decode('utf-8', errors='ignore')
        # If the decoded text doesn't look like HTML, fall back to a permissive
        # single-byte decode (latin-1 maps every byte, so it cannot fail).
        if not raw_html.strip().lower().startswith(('<!doctype', '<html')):
            raw_html = response.content.decode('latin-1', errors='ignore')
        soup = BeautifulSoup(raw_html, 'html.parser')
        title = soup.find('title')
        title_text = title.get_text().strip() if title else "No title found"
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        description = meta_desc.get('content', '') if meta_desc else ""
        # Collect substantial text from the usual main-content containers.
        content_sections = []
        main_selectors = ['main', 'article', '.content', '.main-content', '.post-content',
                          '#content', '#main', '.entry-content', '.post-body']
        for selector in main_selectors:
            for element in soup.select(selector):
                text = element.get_text().strip()
                if len(text) > 100:
                    content_sections.append(text)
        # Collect navigation links from <nav> and <header> elements.
        nav_links = []
        for nav in soup.find_all(['nav', 'header']):
            for link in nav.find_all('a'):
                link_text = link.get_text().strip()
                link_href = link.get('href', '')
                if link_text and link_href:
                    nav_links.append(f"{link_text}: {link_href}")
        def absolutize(resource_url):
            """Resolve a protocol-relative or site-relative URL against the page URL."""
            if resource_url.startswith('//'):
                return 'https:' + resource_url
            if not resource_url.startswith(('http://', 'https://')):
                return urljoin(url, resource_url)
            return resource_url

        # Rewrite image sources to absolute URLs; fall back to data-src for
        # lazy-loaded images that carry no src attribute at all.
        for img in soup.find_all('img'):
            src = img.get('src', '')
            data_src = img.get('data-src', '')
            if src:
                img['src'] = absolutize(src)
            elif data_src:
                img['src'] = absolutize(data_src)
        # Rewrite background-image URLs in inline style attributes and <style> blocks.
        bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
        for element in soup.find_all(attrs={'style': True}):
            style_attr = element.get('style', '')
            for match in re.findall(bg_pattern, style_attr, re.IGNORECASE):
                style_attr = style_attr.replace(match, absolutize(match))
            element['style'] = style_attr
        for style in soup.find_all('style'):
            if style.string:
                style_content = style.string
                for match in re.findall(bg_pattern, style_content, re.IGNORECASE):
                    style_content = style_content.replace(match, absolutize(match))
                style.string = style_content
        # Gather image metadata, then keep only the first few whose URLs respond.
        images = []
        for img in soup.find_all('img'):
            src = img.get('src', '')
            alt = img.get('alt', '')
            if src:
                images.append({'src': src, 'alt': alt})

        def test_image_url(img_url):
            """Return True if a HEAD request to the image URL succeeds."""
            try:
                test_response = requests.head(img_url, timeout=5, allow_redirects=True)
                return test_response.status_code == 200
            except Exception:
                return False

        working_images = [img for img in images[:10] if test_image_url(img['src'])]
        # Serialize the modified DOM, strip comments, and collapse whitespace to
        # keep the payload compact; cap the HTML at 15,000 characters.
        modified_html = str(soup)
        cleaned_html = re.sub(r'<!--.*?-->', '', modified_html, flags=re.DOTALL)
        cleaned_html = re.sub(r'\s+', ' ', cleaned_html)
        cleaned_html = re.sub(r'>\s+<', '><', cleaned_html)
        if len(cleaned_html) > 15000:
            cleaned_html = cleaned_html[:15000] + "\n<!-- ... HTML truncated for length ... -->"
        # Derive a title from the URL when the page didn't provide one.
        if not title_text or title_text == "No title found":
            title_text = url.split('/')[-1] or url.split('/')[-2] or "Website"
        if len(cleaned_html.strip()) < 100:
            website_content=f"""
WEBSITE REDESIGN - EXTRACTION FAILED
====================================
URL: {url}
Title: {title_text}
ERROR: Could not extract meaningful HTML content from this website. This could be due to:
1. The website uses heavy JavaScript to load content dynamically
2. The website has anti-bot protection
3. The website requires authentication
4. The website is using advanced compression or encoding
FALLBACK APPROACH:
Please create a modern, responsive website design for a {title_text.lower()} website. Since I couldn't extract the original content, you can:
1. Create a typical layout for this type of website
2. Use placeholder content that would be appropriate
3. Include modern design elements and responsive features
4. Use a clean, professional design with good typography
5. Make it mobile-friendly and accessible
This will help me create a better design for you."""
            return website_content.strip()
        website_content = f"""
WEBSITE REDESIGN - ORIGINAL HTML CODE
===[TRUNCATED FOR BREVITY]==="""
        return website_content.strip()
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            return "Error: Website blocked access (403 Forbidden). This website may have anti-bot protection. Try a different website or provide a description of what you want to build instead."
        elif e.response.status_code == 404:
            return "Error: Website not found (404). Please check the URL and try again."
        elif e.response.status_code >= 500:
            return f"Error: Website server error ({e.response.status_code}). Please try again later."
        else:
            return f"Error accessing website: HTTP {e.response.status_code} - {str(e)}"
    except requests.exceptions.Timeout:
        return "Error: Request timed out. The website may be slow or unavailable."
    except requests.exceptions.ConnectionError:
        return "Error: Could not connect to the website. Please check your internet connection and the URL."
    except requests.exceptions.RequestException as e:
        return f"Error accessing website: {str(e)}"
    except Exception as e:
        return f"Error extracting website content: {str(e)}"
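

# A minimal smoke test for the helpers above. This CLI entry point is a sketch
# added for illustration, not part of the original module; the dispatch rule
# (existing path = file, otherwise URL) is an assumption.
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print("usage: python extractor.py <file-path-or-url>")
        sys.exit(1)
    target = sys.argv[1]
    if os.path.exists(target):
        print(extract_text_from_file(target))
    else:
        print(extract_website_content(target))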