diff --git a/owui-site-crawler.py b/owui-site-crawler.py
index 937be55..3e6b6c9 100644
--- a/owui-site-crawler.py
+++ b/owui-site-crawler.py
@@ -9,23 +9,68 @@
 from bs4 import BeautifulSoup
 from markitdown import MarkItDown
 import json
 import logging
+from io import BytesIO
+import re
+import tempfile
+import shutil
+from pathlib import Path
 
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 
+# Try to import Selenium, but make it optional
+try:
+    from selenium import webdriver
+    from selenium.webdriver.chrome.options import Options
+    from selenium.webdriver.common.by import By
+    from selenium.webdriver.support.ui import WebDriverWait
+    from selenium.webdriver.support import expected_conditions as EC
+    from selenium.common.exceptions import TimeoutException, WebDriverException
+    SELENIUM_AVAILABLE = True
+except ImportError:
+    SELENIUM_AVAILABLE = False
+    logger.warning("Selenium not available. Falling back to simple crawler.")
+
 class WebScraper:
-    def __init__(self, base_url, max_depth=2, delay=1.0, exclude_patterns=None):
+    def __init__(self, base_url, max_depth=2, delay=1.0, exclude_patterns=None, use_selenium=False):
         self.base_url = base_url
         self.domain = urlparse(base_url).netloc
         self.visited_urls = set()
         self.max_depth = max_depth
         self.delay = delay
         self.exclude_patterns = exclude_patterns or []
-        self.pages = {}  # Dictionary to store URL: HTML content
-        self.session = requests.Session()
+        self.pages = {}
+        self.use_selenium = use_selenium and SELENIUM_AVAILABLE
+
+        if self.use_selenium:
+            self.setup_selenium()
+        else:
+            self.session = requests.Session()
+
+        self.base_path = urlparse(base_url).path.rstrip('/')
 
+    def setup_selenium(self):
+        """Setup Selenium WebDriver with headless Chrome."""
+        try:
+            chrome_options = Options()
+            chrome_options.add_argument("--headless")
+            chrome_options.add_argument("--no-sandbox")
+            chrome_options.add_argument("--disable-dev-shm-usage")
+            chrome_options.add_argument("--disable-gpu")
+            chrome_options.add_argument("--window-size=1920,1080")
+            chrome_options.add_argument("--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
+
+            self.driver = webdriver.Chrome(options=chrome_options)
+            self.driver.set_page_load_timeout(30)
+            logger.info("Selenium WebDriver initialized successfully")
+        except Exception as e:
+            logger.error(f"Failed to initialize Selenium: {e}")
+            logger.info("Falling back to requests")
+            self.use_selenium = False
+            self.session = requests.Session()
+
     def should_exclude(self, url):
         """Check if URL should be excluded based on patterns."""
         for pattern in self.exclude_patterns:
@@ -36,48 +81,102 @@ class WebScraper:
     def is_valid_url(self, url):
         """Check if the URL is valid and belongs to the same domain."""
         parsed = urlparse(url)
-        return bool(parsed.netloc) and parsed.netloc == self.domain
+        if not (parsed.netloc and parsed.netloc == self.domain):
+            return False
+        return parsed.path.startswith(self.base_path)
 
-    def get_links(self, url, html):
-        """Extract all links from the HTML content."""
+    def get_links_selenium(self, url):
+        """Extract all links from the page using Selenium."""
+        try:
+            self.driver.get(url)
+            # Wait for page to load
+            WebDriverWait(self.driver, 10).until(
+                EC.presence_of_element_located((By.TAG_NAME, "body"))
+            )
+
+            # Try to wait for main content to load
+            try:
+                WebDriverWait(self.driver, 5).until(
+                    EC.presence_of_element_located((By.TAG_NAME, "main"))
+                )
+            except TimeoutException:
+                pass
+
+            # Get page source after JavaScript execution
+            html = self.driver.page_source
+            self.pages[url] = html
+
+            # Extract links
+            links = set()
+            for a_tag in self.driver.find_elements(By.TAG_NAME, "a"):
+                href = a_tag.get_attribute("href")
+                if href:
+                    full_url = urljoin(url, href)
+                    if self.is_valid_url(full_url) and not self.should_exclude(full_url):
+                        links.add(full_url)
+
+            return list(links), html
+
+        except Exception as e:
+            logger.error(f"Error getting links with Selenium from {url}: {e}")
+            return [], ""
+
+    def get_links_requests(self, url, html):
+        """Extract all links from the HTML content using requests."""
         soup = BeautifulSoup(html, 'html.parser')
+        links = set()
         for a_tag in soup.find_all('a', href=True):
             href = a_tag['href']
-            # Handle relative URLs
             full_url = urljoin(url, href)
-            # Filter URLs to only include those from the same domain
             if self.is_valid_url(full_url) and not self.should_exclude(full_url):
-                yield full_url
+                links.add(full_url)
+        return list(links)
 
+    def get_page_requests(self, url):
+        """Get page content using requests."""
+        try:
+            response = self.session.get(url, timeout=10)
+            if response.status_code == 200:
+                return response.text
+            else:
+                logger.warning(f"Failed to fetch {url}: HTTP {response.status_code}")
+                return None
+        except Exception as e:
+            logger.error(f"Error fetching {url} with requests: {e}")
+            return None
+
     def crawl(self, url=None, depth=0):
         """Crawl the website starting from the URL up to max_depth."""
         if url is None:
             url = self.base_url
 
-        # Stop if we've reached max depth or already visited this URL
         if depth > self.max_depth or url in self.visited_urls:
             return
 
-        # Mark this URL as visited
         self.visited_urls.add(url)
 
         try:
             logger.info(f"Crawling: {url} (Depth: {depth})")
-            response = self.session.get(url, timeout=10)
-            if response.status_code == 200:
-                # Store the HTML content
-                self.pages[url] = response.text
-
-                # Extract and follow links
-                if depth < self.max_depth:
-                    for link in self.get_links(url, response.text):
-                        # Be nice to the server - add delay
-                        time.sleep(self.delay)
-                        self.crawl(link, depth + 1)
+            if self.use_selenium:
+                links, html = self.get_links_selenium(url)
+                if html:
+                    self.pages[url] = html
             else:
-                logger.warning(f"Failed to fetch {url}: HTTP {response.status_code}")
-
+                html = self.get_page_requests(url)
+                if html:
+                    self.pages[url] = html
+                    links = self.get_links_requests(url, html)
+                else:
+                    links = []
+
+            # Follow links
+            if depth < self.max_depth and links:
+                logger.info(f"Found {len(links)} links to follow from {url}")
+                for link in links:
+                    time.sleep(self.delay)
+                    self.crawl(link, depth + 1)
+
         except Exception as e:
             logger.error(f"Error crawling {url}: {e}")
 
@@ -86,8 +185,10 @@ class WebScraper:
         return self.pages
 
     def close(self):
-        """Close the requests session."""
-        if hasattr(self, 'session') and self.session:
+        """Close the requests session or Selenium driver."""
+        if self.use_selenium and hasattr(self, 'driver'):
+            self.driver.quit()
+        elif hasattr(self, 'session'):
             self.session.close()
 
 
@@ -163,102 +264,94 @@ class OpenWebUIUploader:
             logger.error(f"Error creating knowledge base: {e}")
             raise
 
-    def upload_file(self, kb_id, content, filename, content_type="text/markdown"):
-        """Upload a file to the knowledge base."""
+    def validate_content(self, content, filename):
+        """Validate that content is not empty and has sufficient meaningful text."""
+        if not content or not content.strip():
+            return False, "Content is empty"
+
+        # Count meaningful lines (not just headers or empty lines)
+        lines = [line.strip() for line in content.split('\n') if line.strip()]
+        meaningful_lines = [line for line in lines if not line.startswith('#') and len(line) > 20]
+
+        if len(meaningful_lines) < 3:
+            return False, f"Not enough meaningful content ({len(meaningful_lines)} lines)"
+
+        # Count words in clean content
+        clean_content = re.sub(r'#.*?\n', '', content)
+        clean_content = re.sub(r'```.*?```', '', clean_content, flags=re.DOTALL)
+        clean_content = re.sub(r'`.*?`', '', clean_content)
+        clean_content = re.sub(r'\*.*?\*', '', clean_content)
+        clean_content = clean_content.strip()
+
+        words = clean_content.split()
+        if len(words) < 50:
+            return False, f"Content too short ({len(words)} words after cleaning)"
+
+        return True, "Valid content"
+
+    def upload_file_from_path(self, kb_id, file_path, filename, content_type="text/markdown"):
+        """Upload a file to the knowledge base from a file path."""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+        except Exception as e:
+            logger.error(f"Error reading file {file_path}: {e}")
+            return {"status": "error", "reason": f"read_error: {str(e)}"}
+
+        is_valid, validation_msg = self.validate_content(content, filename)
+        if not is_valid:
+            logger.warning(f"Skipping invalid file {filename}: {validation_msg}")
+            return {"status": "skipped", "reason": validation_msg}
+
         upload_endpoint = f"{self.base_url}/api/v1/files/"
 
-        # Create a temporary file for the upload
-        temp_file_path = f"/tmp/{filename}"
-        with open(temp_file_path, 'w') as f:
-            f.write(content)
-
         try:
-            # Use context manager for file upload request
-            with open(temp_file_path, 'rb') as f:
+            with open(file_path, 'rb') as f:
                 files = {'file': (filename, f, content_type)}
-                with self.session.post(
+                upload_response = self.session.post(
                     upload_endpoint,
                     headers={"Authorization": f"Bearer {self.api_token}"},
                     files=files
-                ) as upload_response:
-                    upload_response.raise_for_status()
-                    file_id = upload_response.json().get('id')
+                )
+                upload_response.raise_for_status()
+                file_id = upload_response.json().get('id')
+
+            if not file_id:
+                raise ValueError("No file ID returned from upload")
 
-            # Add the file to the knowledge base
             add_file_endpoint = f"{self.base_url}/api/v1/knowledge/{kb_id}/file/add"
-            with self.session.post(
+            add_response = self.session.post(
                 add_file_endpoint,
                 headers={
                     "Authorization": f"Bearer {self.api_token}",
                     "Content-Type": "application/json"
                 },
                 json={'file_id': file_id}
-            ) as add_response:
-                add_response.raise_for_status()
-                return add_response.json()
+            )
+
+            if add_response.status_code == 400:
+                error_msg = add_response.text
+                if "empty" in error_msg.lower():
+                    logger.warning(f"OpenWebUI rejected file {filename} as empty content")
+                    try:
+                        delete_endpoint = f"{self.base_url}/api/v1/files/{file_id}"
+                        self.session.delete(delete_endpoint)
+                    except:
+                        pass
+                    return {"status": "skipped", "reason": "rejected_as_empty_by_openwebui"}
+                else:
+                    add_response.raise_for_status()
+
+            add_response.raise_for_status()
+            return add_response.json()
 
         except requests.exceptions.RequestException as e:
-            logger.error(f"Error uploading file: {e}")
-            raise
-        finally:
-            # Clean up the temporary file
-            if os.path.exists(temp_file_path):
-                os.unlink(temp_file_path)
-
-    def update_file(self, kb_id, existing_file_id, content, filename, content_type="text/markdown"):
-        """Update an existing file in the knowledge base."""
-        # First upload the new version of the file
-        upload_endpoint = f"{self.base_url}/api/v1/files/"
-
-        # Create a temporary file for the upload
temp_file_path = f"/tmp/{filename}" - with open(temp_file_path, 'w') as f: - f.write(content) - - try: - # Upload the new file - with open(temp_file_path, 'rb') as f: - files = {'file': (filename, f, content_type)} - with self.session.post( - upload_endpoint, - headers={"Authorization": f"Bearer {self.api_token}"}, - files=files - ) as upload_response: - upload_response.raise_for_status() - new_file_id = upload_response.json().get('id') - - # Remove the old file from the knowledge base - remove_endpoint = f"{self.base_url}/api/v1/knowledge/{kb_id}/file/remove" - with self.session.post( - remove_endpoint, - headers={ - "Authorization": f"Bearer {self.api_token}", - "Content-Type": "application/json" - }, - json={'file_id': existing_file_id} - ) as remove_response: - remove_response.raise_for_status() - - # Add the new file to the knowledge base - add_endpoint = f"{self.base_url}/api/v1/knowledge/{kb_id}/file/add" - with self.session.post( - add_endpoint, - headers={ - "Authorization": f"Bearer {self.api_token}", - "Content-Type": "application/json" - }, - json={'file_id': new_file_id} - ) as add_response: - add_response.raise_for_status() - return add_response.json() - - except requests.exceptions.RequestException as e: - logger.error(f"Error updating file: {e}") - raise - finally: - # Clean up the temporary file - if os.path.exists(temp_file_path): - os.unlink(temp_file_path) + logger.error(f"Error uploading file {filename}: {e}") + if hasattr(e, 'response') and e.response is not None: + if e.response.status_code == 400 and "empty" in str(e.response.text).lower(): + logger.warning(f"OpenWebUI rejected file {filename} as empty content") + return {"status": "skipped", "reason": "rejected_as_empty_by_openwebui"} + return {"status": "error", "reason": f"upload_error: {str(e)}"} def close(self): """Close the requests session.""" @@ -266,24 +359,82 @@ class OpenWebUIUploader: self.session.close() +def extract_clean_text(html_content, url): + """Extract clean, meaningful text from HTML.""" + soup = BeautifulSoup(html_content, 'html.parser') + + # Remove unwanted elements + for element in soup(["script", "style", "nav", "header", "footer", "aside", + "meta", "link", "button", "form", "input", "select"]): + element.decompose() + + # Try different content selectors + content_selectors = [ + 'main', 'article', '.content', '#content', '.main', '#main', + '.documentation', '#documentation', '.doc', '#doc', + '.page', '#page', '.post', '#post', + 'body' + ] + + content_element = None + for selector in content_selectors: + content_element = soup.select_one(selector) + if content_element: + logger.info(f"Found content using selector: {selector}") + break + + if not content_element: + content_element = soup + + # Extract text with structure + text_parts = [] + + for element in content_element.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'div']): + text = element.get_text(strip=True) + if text and len(text) > 10: + if element.name.startswith('h'): + level = int(element.name[1]) + text_parts.append(f"{'#' * level} {text}") + elif element.name == 'li': + text_parts.append(f"- {text}") + else: + text_parts.append(text) + + # Fallback to general text extraction + if len(text_parts) < 3: + text = content_element.get_text() + lines = (line.strip() for line in text.splitlines()) + chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) + text_parts = [chunk for chunk in chunks if chunk and len(chunk) > 20] + + if text_parts: + content = f"# Source: {url}\n\n" + "\n\n".join(text_parts) 
+ logger.info(f"Extracted {len(text_parts)} content blocks, {len(content)} total chars") + return content + else: + logger.warning(f"No meaningful content extracted from {url}") + return f"# Source: {url}\n\n*No meaningful text content could be extracted from this page.*" + + def convert_to_markdown(html_content, url): - """Convert HTML content to Markdown using MarkItDown.""" + """Convert HTML to Markdown with robust fallbacks.""" + clean_text = extract_clean_text(html_content, url) + if len(clean_text.strip()) > 200: + return clean_text + try: md = MarkItDown() - - # Use BytesIO to provide a binary stream to convert_stream - from io import BytesIO html_bytes = BytesIO(html_content.encode('utf-8')) - - # Convert the HTML to Markdown result = md.convert_stream(html_bytes, mime_type='text/html') - # Add a header with the source URL - markdown_with_header = f"# {url}\n\n{result.text_content}" - return markdown_with_header + if result and hasattr(result, 'text_content') and result.text_content: + markdown_content = result.text_content.strip() + if markdown_content and len(markdown_content) > 200: + return f"# Source: {url}\n\n{markdown_content}" except Exception as e: - logger.error(f"Error converting to markdown: {e}") - return f"# {url}\n\nError converting content: {str(e)}" + logger.warning(f"MarkItDown failed for {url}: {e}") + + return clean_text def is_valid_json(content): @@ -295,6 +446,64 @@ def is_valid_json(content): return False +def create_unique_filename(url): + """Create a unique filename from URL including fragment.""" + parsed = urlparse(url) + + path = parsed.path + if not path or path == '/': + path = 'index' + + fragment = parsed.fragment + if fragment: + fragment_clean = re.sub(r'[^a-zA-Z0-9]', '_', fragment) + filename = f"{path.strip('/')}_{fragment_clean}" + else: + filename = path.strip('/') + + filename = re.sub(r'[^a-zA-Z0-9_.-]', '_', filename) + + if len(filename) < 5: + domain_part = re.sub(r'[^a-zA-Z0-9]', '_', parsed.netloc) + filename = f"{domain_part}_{filename}" + + if not filename.endswith('.md'): + filename = f"{filename}.md" + + return filename + + +def save_files_to_temp_dir(processed_files, temp_dir): + """Save processed files to temporary directory.""" + saved_files = [] + + for file_info in processed_files: + try: + file_path = os.path.join(temp_dir, file_info['filename']) + + counter = 1 + original_path = file_path + while os.path.exists(file_path): + name, ext = os.path.splitext(original_path) + file_path = f"{name}_{counter}{ext}" + counter += 1 + + with open(file_path, 'w', encoding='utf-8') as f: + f.write(file_info['content']) + + saved_files.append({ + 'file_path': file_path, + 'filename': os.path.basename(file_path), + 'content_type': file_info['content_type'], + 'url': file_info['url'] + }) + logger.info(f"Saved file to temp directory: {os.path.basename(file_path)}") + except Exception as e: + logger.error(f"Error saving file {file_info['filename']}: {e}") + + return saved_files + + def main(): parser = argparse.ArgumentParser(description='Scrape a website and create an Open WebUI knowledge base') parser.add_argument('--token', '-t', required=True, help='Your OpenWebUI API token') @@ -308,26 +517,36 @@ def main(): parser.add_argument('--include-json', '-j', action='store_true', help='Include JSON files and API endpoints') parser.add_argument('--update', action='store_true', help='Update existing files in the knowledge base') parser.add_argument('--skip-existing', action='store_true', help='Skip existing files in the knowledge base') + 
+    parser.add_argument('--min-content-length', type=int, default=200, help='Minimum content length to include (default: 200 characters)')
+    parser.add_argument('--keep-temp-files', action='store_true', help='Keep temporary files for debugging')
+    parser.add_argument('--use-selenium', action='store_true', help='Use Selenium for JavaScript-rendered sites')
 
     args = parser.parse_args()
 
-    # Check for conflicting options
     if args.update and args.skip_existing:
         logger.error("Cannot use both --update and --skip-existing flags at the same time")
         return 1
 
-    # Initialize resources that need to be closed
+    # Check if Selenium is requested but not available
+    if args.use_selenium and not SELENIUM_AVAILABLE:
+        logger.warning("Selenium requested but not available. Install with: pip install selenium webdriver-manager")
+        logger.warning("Falling back to simple crawler.")
+        args.use_selenium = False
+
     scraper = None
     uploader = None
+    temp_dir = None
 
     try:
-        # 1. Crawl the website
        logger.info(f"Starting web crawl of {args.website_url} to depth {args.depth}")
+        logger.info(f"Using {'Selenium' if args.use_selenium else 'simple'} crawler")
+
         scraper = WebScraper(
             base_url=args.website_url,
             max_depth=args.depth,
             delay=args.delay,
-            exclude_patterns=args.exclude or []
+            exclude_patterns=args.exclude or [],
+            use_selenium=args.use_selenium
         )
         scraper.crawl()
 
@@ -338,46 +557,49 @@ def main():
             logger.error("No pages were crawled. Exiting.")
             return 1
 
-        # 2. Process content (convert HTML to Markdown or handle JSON)
         logger.info("Processing crawled content")
         processed_files = []
+        empty_files = 0
 
         for url, html_content in crawled_pages.items():
-            # For JSON content, preserve it as JSON
+            if not html_content or len(html_content.strip()) < 100:
+                logger.warning(f"Skipping empty page: {url}")
+                empty_files += 1
+                continue
+
             if url.endswith('.json') or (is_valid_json(html_content) and args.include_json):
                 if is_valid_json(html_content):
                     try:
                         json_obj = json.loads(html_content)
                         pretty_json = json.dumps(json_obj, indent=2)
 
-                        # Create filename for JSON file
-                        parsed_url = urlparse(url)
-                        filename = f"{parsed_url.netloc}{parsed_url.path}"
-                        filename = filename.replace('/', '_').replace('.', '_')
-                        if not filename.endswith('.json'):
-                            filename = f"{filename}.json"
-
-                        processed_files.append({
-                            'content': pretty_json,
-                            'content_type': 'application/json',
-                            'filename': filename,
-                            'url': url
-                        })
-                        logger.info(f"Processed JSON content from {url}")
+                        if len(pretty_json.strip()) >= args.min_content_length:
+                            filename = create_unique_filename(url)
+                            if not filename.endswith('.json'):
+                                filename = f"{filename}.json"
+
+                            processed_files.append({
+                                'content': pretty_json,
+                                'content_type': 'application/json',
+                                'filename': filename,
+                                'url': url
+                            })
+                            logger.info(f"Processed JSON content from {url}")
+                        else:
+                            logger.warning(f"Skipping JSON file {url} - content too short")
+                            empty_files += 1
                         continue
                     except ValueError:
-                        # Not valid JSON despite the extension, fall back to Markdown
                        pass
 
-            # For all other content, convert to Markdown
             markdown_content = convert_to_markdown(html_content, url)
 
-            # Create a safe filename
-            parsed_url = urlparse(url)
-            filename = f"{parsed_url.netloc}{parsed_url.path}".replace('/', '_').replace('.', '_')
-            if not filename.endswith('.md'):
-                filename = f"{filename}.md"
-
+            if not markdown_content or len(markdown_content.strip()) < args.min_content_length:
+                logger.warning(f"Skipping {url} - no extractable content found after conversion")
+                empty_files += 1
+                continue
+
+            filename = create_unique_filename(url)
             processed_files.append({
                 'content': markdown_content,
                 'content_type': 'text/markdown',
@@ -385,10 +607,38 @@ def main():
                 'url': url
             })
 
-        logger.info(f"Processed {len(processed_files)} files")
+        logger.info(f"Processed {len(processed_files)} files, skipped {empty_files} empty files")
+
+        if not processed_files:
+            logger.error("No files with valid content were processed. Exiting.")
+            return 1
+
+        script_dir = Path(__file__).parent
+        temp_dir = script_dir / "temp_webscraper_files"
+        temp_dir.mkdir(exist_ok=True)
+        logger.info(f"Created temporary directory: {temp_dir}")
+
+        saved_files = save_files_to_temp_dir(processed_files, temp_dir)
+        logger.info(f"Saved {len(saved_files)} files to temporary directory")
+
+        logger.info("=== DEBUG: File Content Analysis ===")
+        for file_info in saved_files:
+            try:
+                with open(file_info['file_path'], 'r', encoding='utf-8') as f:
+                    content = f.read()
+                lines = content.split('\n')
+                meaningful_lines = [line for line in lines if line.strip() and not line.startswith('#') and len(line.strip()) > 20]
+
+                logger.info(f"File: {file_info['filename']}")
+                logger.info(f"  Total size: {len(content)} chars")
+                logger.info(f"  Total lines: {len(lines)}")
+                logger.info(f"  Meaningful lines: {len(meaningful_lines)}")
+                if meaningful_lines:
+                    logger.info(f"  First meaningful line: {meaningful_lines[0][:100]}{'...' if len(meaningful_lines[0]) > 100 else ''}")
+            except Exception as e:
+                logger.error(f"Error reading saved file {file_info['filename']}: {e}")
+        logger.info("=== END DEBUG ===")
 
-        # 3. Upload to Open WebUI
-        # First check if a knowledge base with the specified name already exists
         uploader = OpenWebUIUploader(args.base_url, args.token)
         existing_kb = uploader.get_knowledge_base_by_name(args.kb_name)
 
@@ -396,7 +646,6 @@
             kb_id = existing_kb.get('id')
             logger.info(f"Found existing knowledge base '{args.kb_name}' with ID: {kb_id}")
         else:
-            # Create a new knowledge base if none exists with that name
             logger.info(f"Creating new knowledge base '{args.kb_name}' in Open WebUI")
             kb = uploader.create_knowledge_base(args.kb_name, args.kb_purpose)
             kb_id = kb.get('id')
@@ -405,61 +654,46 @@
                 return 1
             logger.info(f"Created knowledge base with ID: {kb_id}")
 
-        # 4. Upload each file
         success_count = 0
         skip_count = 0
         update_count = 0
         error_count = 0
+        empty_skip_count = 0
 
-        for file_info in processed_files:
+        for file_info in saved_files:
             try:
                 filename = file_info['filename']
+                file_path = file_info['file_path']
                 existing_file_id = uploader.file_exists_in_kb(kb_id, filename)
 
-                # Handle existing files based on options
-                if existing_file_id:
-                    if args.skip_existing:
-                        logger.info(f"Skipping existing file: {filename}")
-                        skip_count += 1
-                        continue
-                    elif args.update:
-                        logger.info(f"Updating existing file: {filename}")
-                        uploader.update_file(
-                            kb_id,
-                            existing_file_id,
-                            file_info['content'],
-                            filename,
-                            file_info['content_type']
-                        )
-                        update_count += 1
+                if existing_file_id and args.skip_existing:
+                    logger.info(f"Skipping existing file: {filename}")
+                    skip_count += 1
+                    continue
+
+                logger.info(f"Uploading file: {filename}")
+                result = uploader.upload_file_from_path(
+                    kb_id,
+                    file_path,
+                    filename,
+                    file_info['content_type']
+                )
+                if isinstance(result, dict) and result.get('status') in ['skipped', 'error']:
+                    if result.get('status') == 'skipped':
+                        empty_skip_count += 1
                     else:
-                        # Default behavior: add as new file
-                        logger.info(f"Adding duplicate file (existing file will remain): {filename}")
-                        uploader.upload_file(
-                            kb_id,
-                            file_info['content'],
-                            filename,
-                            file_info['content_type']
-                        )
-                        success_count += 1
+                        error_count += 1
+                    logger.warning(f"Failed to upload {filename}: {result.get('reason')}")
                 else:
-                    # New file
-                    logger.info(f"Uploading new file: {filename}")
-                    uploader.upload_file(
-                        kb_id,
-                        file_info['content'],
-                        filename,
-                        file_info['content_type']
-                    )
                     success_count += 1
 
-                # Add a small delay between uploads
                 time.sleep(0.5)
+
             except Exception as e:
                 logger.error(f"Failed to process {file_info['filename']}: {e}")
                 error_count += 1
 
-        logger.info(f"Upload complete: {success_count} files uploaded, {update_count} files updated, {skip_count} files skipped, {error_count} errors")
+        logger.info(f"Upload complete: {success_count} files uploaded, {update_count} files updated, {skip_count} files skipped, {empty_skip_count} empty/invalid files skipped, {error_count} errors")
 
         return 0
 
@@ -467,11 +701,20 @@
         logger.error(f"An unexpected error occurred: {e}")
         return 1
     finally:
-        # Ensure all resources are properly closed
         if scraper:
             scraper.close()
         if uploader:
             uploader.close()
+
+        if temp_dir and temp_dir.exists():
+            if args.keep_temp_files:
+                logger.info(f"Keeping temporary files in: {temp_dir}")
+            else:
+                try:
+                    shutil.rmtree(temp_dir)
+                    logger.info("Cleaned up temporary directory")
+                except Exception as e:
+                    logger.warning(f"Failed to clean up temporary directory {temp_dir}: {e}")
 
 
 if __name__ == "__main__":
diff --git a/requirements.txt b/requirements.txt
index 6afde0a..d97b41d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
 requests
 beautifulsoup4
-markitdown[all]
\ No newline at end of file
+markitdown[all]
+selenium
+webdriver-manager
\ No newline at end of file
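
Note: requirements.txt now pins webdriver-manager, but setup_selenium() constructs the driver with webdriver.Chrome(options=chrome_options), which relies on a chromedriver already being on PATH (or on Selenium Manager in Selenium 4.6+). If automatic driver resolution is the goal, a minimal sketch of how the package could be wired in looks like this (illustrative only, not part of the patch above):

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager

    chrome_options = Options()
    chrome_options.add_argument("--headless")

    # webdriver-manager downloads and caches a chromedriver matching the local
    # Chrome build, then hands its path to Selenium via a Service object.
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

If the intent is to keep relying on Selenium Manager instead, webdriver-manager could simply be dropped from requirements.txt.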