Fix crawling of JavaScript-rendered sites

Muhammad Tamir
2025-11-14 21:08:28 +07:00
parent d7705cc672
commit 6f2a95e037
2 changed files with 432 additions and 187 deletions


@@ -9,23 +9,68 @@ from bs4 import BeautifulSoup
from markitdown import MarkItDown
import json
import logging
from io import BytesIO
import re
import tempfile
import shutil
from pathlib import Path

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Try to import Selenium, but make it optional
try:
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException, WebDriverException
    SELENIUM_AVAILABLE = True
except ImportError:
    SELENIUM_AVAILABLE = False
    logger.warning("Selenium not available. Falling back to simple crawler.")

class WebScraper:
    def __init__(self, base_url, max_depth=2, delay=1.0, exclude_patterns=None, use_selenium=False):
        self.base_url = base_url
        self.domain = urlparse(base_url).netloc
        self.visited_urls = set()
        self.max_depth = max_depth
        self.delay = delay
        self.exclude_patterns = exclude_patterns or []
        self.pages = {}
        self.use_selenium = use_selenium and SELENIUM_AVAILABLE
        if self.use_selenium:
            self.setup_selenium()
        else:
            self.session = requests.Session()
        self.base_path = urlparse(base_url).path.rstrip('/')

    def setup_selenium(self):
        """Setup Selenium WebDriver with headless Chrome."""
        try:
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--window-size=1920,1080")
            chrome_options.add_argument("--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
            self.driver = webdriver.Chrome(options=chrome_options)
            self.driver.set_page_load_timeout(30)
            logger.info("Selenium WebDriver initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize Selenium: {e}")
            logger.info("Falling back to requests")
            self.use_selenium = False
            self.session = requests.Session()

    def should_exclude(self, url):
        """Check if URL should be excluded based on patterns."""
        for pattern in self.exclude_patterns:
@@ -36,48 +81,102 @@ class WebScraper:
    def is_valid_url(self, url):
        """Check if the URL is valid and belongs to the same domain."""
        parsed = urlparse(url)
        if not (parsed.netloc and parsed.netloc == self.domain):
            return False
        return parsed.path.startswith(self.base_path)

    def get_links_selenium(self, url):
        """Extract all links from the page using Selenium."""
        try:
            self.driver.get(url)
            # Wait for page to load
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            # Try to wait for main content to load
            try:
                WebDriverWait(self.driver, 5).until(
                    EC.presence_of_element_located((By.TAG_NAME, "main"))
                )
            except TimeoutException:
                pass
            # Get page source after JavaScript execution
            html = self.driver.page_source
            self.pages[url] = html
            # Extract links
            links = set()
            for a_tag in self.driver.find_elements(By.TAG_NAME, "a"):
                href = a_tag.get_attribute("href")
                if href:
                    full_url = urljoin(url, href)
                    if self.is_valid_url(full_url) and not self.should_exclude(full_url):
                        links.add(full_url)
            return list(links), html
        except Exception as e:
            logger.error(f"Error getting links with Selenium from {url}: {e}")
            return [], ""

    def get_links_requests(self, url, html):
        """Extract all links from the HTML content using requests."""
        soup = BeautifulSoup(html, 'html.parser')
        links = set()
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            full_url = urljoin(url, href)
            if self.is_valid_url(full_url) and not self.should_exclude(full_url):
                links.add(full_url)
        return list(links)

    def get_page_requests(self, url):
        """Get page content using requests."""
        try:
            response = self.session.get(url, timeout=10)
            if response.status_code == 200:
                return response.text
            else:
                logger.warning(f"Failed to fetch {url}: HTTP {response.status_code}")
                return None
        except Exception as e:
            logger.error(f"Error fetching {url} with requests: {e}")
            return None

    def crawl(self, url=None, depth=0):
        """Crawl the website starting from the URL up to max_depth."""
        if url is None:
            url = self.base_url
        if depth > self.max_depth or url in self.visited_urls:
            return
        self.visited_urls.add(url)
        try:
            logger.info(f"Crawling: {url} (Depth: {depth})")
            if self.use_selenium:
                links, html = self.get_links_selenium(url)
                if html:
                    self.pages[url] = html
            else:
                html = self.get_page_requests(url)
                if html:
                    self.pages[url] = html
                    links = self.get_links_requests(url, html)
                else:
                    links = []
            # Follow links
            if depth < self.max_depth and links:
                logger.info(f"Found {len(links)} links to follow from {url}")
                for link in links:
                    time.sleep(self.delay)
                    self.crawl(link, depth + 1)
        except Exception as e:
            logger.error(f"Error crawling {url}: {e}")
@@ -86,8 +185,10 @@ class WebScraper:
        return self.pages

    def close(self):
        """Close the requests session or Selenium driver."""
        if self.use_selenium and hasattr(self, 'driver'):
            self.driver.quit()
        elif hasattr(self, 'session'):
            self.session.close()
@@ -163,102 +264,94 @@ class OpenWebUIUploader:
            logger.error(f"Error creating knowledge base: {e}")
            raise

    def validate_content(self, content, filename):
        """Validate that content is not empty and has sufficient meaningful text."""
        if not content or not content.strip():
            return False, "Content is empty"
        # Count meaningful lines (not just headers or empty lines)
        lines = [line.strip() for line in content.split('\n') if line.strip()]
        meaningful_lines = [line for line in lines if not line.startswith('#') and len(line) > 20]
        if len(meaningful_lines) < 3:
            return False, f"Not enough meaningful content ({len(meaningful_lines)} lines)"
        # Count words in clean content
        clean_content = re.sub(r'#.*?\n', '', content)
        clean_content = re.sub(r'```.*?```', '', clean_content, flags=re.DOTALL)
        clean_content = re.sub(r'`.*?`', '', clean_content)
        clean_content = re.sub(r'\*.*?\*', '', clean_content)
        clean_content = clean_content.strip()
        words = clean_content.split()
        if len(words) < 50:
            return False, f"Content too short ({len(words)} words after cleaning)"
        return True, "Valid content"

    def upload_file_from_path(self, kb_id, file_path, filename, content_type="text/markdown"):
        """Upload a file to the knowledge base from a file path."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            logger.error(f"Error reading file {file_path}: {e}")
            return {"status": "error", "reason": f"read_error: {str(e)}"}

        is_valid, validation_msg = self.validate_content(content, filename)
        if not is_valid:
            logger.warning(f"Skipping invalid file {filename}: {validation_msg}")
            return {"status": "skipped", "reason": validation_msg}

        upload_endpoint = f"{self.base_url}/api/v1/files/"
        try:
            with open(file_path, 'rb') as f:
                files = {'file': (filename, f, content_type)}
                upload_response = self.session.post(
                    upload_endpoint,
                    headers={"Authorization": f"Bearer {self.api_token}"},
                    files=files
                )
            upload_response.raise_for_status()
            file_id = upload_response.json().get('id')
            if not file_id:
                raise ValueError("No file ID returned from upload")

            # Add the file to the knowledge base
            add_file_endpoint = f"{self.base_url}/api/v1/knowledge/{kb_id}/file/add"
            add_response = self.session.post(
                add_file_endpoint,
                headers={
                    "Authorization": f"Bearer {self.api_token}",
                    "Content-Type": "application/json"
                },
                json={'file_id': file_id}
            )

            if add_response.status_code == 400:
                error_msg = add_response.text
                if "empty" in error_msg.lower():
                    logger.warning(f"OpenWebUI rejected file {filename} as empty content")
                    try:
                        delete_endpoint = f"{self.base_url}/api/v1/files/{file_id}"
                        self.session.delete(delete_endpoint)
                    except:
                        pass
                    return {"status": "skipped", "reason": "rejected_as_empty_by_openwebui"}
                else:
                    add_response.raise_for_status()

            add_response.raise_for_status()
            return add_response.json()
        except requests.exceptions.RequestException as e:
            logger.error(f"Error uploading file {filename}: {e}")
            if hasattr(e, 'response') and e.response is not None:
                if e.response.status_code == 400 and "empty" in str(e.response.text).lower():
                    logger.warning(f"OpenWebUI rejected file {filename} as empty content")
                    return {"status": "skipped", "reason": "rejected_as_empty_by_openwebui"}
            return {"status": "error", "reason": f"upload_error: {str(e)}"}

    def close(self):
        """Close the requests session."""
@@ -266,24 +359,82 @@ class OpenWebUIUploader:
        self.session.close()

def extract_clean_text(html_content, url):
    """Extract clean, meaningful text from HTML."""
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove unwanted elements
    for element in soup(["script", "style", "nav", "header", "footer", "aside",
                         "meta", "link", "button", "form", "input", "select"]):
        element.decompose()

    # Try different content selectors
    content_selectors = [
        'main', 'article', '.content', '#content', '.main', '#main',
        '.documentation', '#documentation', '.doc', '#doc',
        '.page', '#page', '.post', '#post',
        'body'
    ]
    content_element = None
    for selector in content_selectors:
        content_element = soup.select_one(selector)
        if content_element:
            logger.info(f"Found content using selector: {selector}")
            break
    if not content_element:
        content_element = soup

    # Extract text with structure
    text_parts = []
    for element in content_element.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'div']):
        text = element.get_text(strip=True)
        if text and len(text) > 10:
            if element.name.startswith('h'):
                level = int(element.name[1])
                text_parts.append(f"{'#' * level} {text}")
            elif element.name == 'li':
                text_parts.append(f"- {text}")
            else:
                text_parts.append(text)

    # Fallback to general text extraction
    if len(text_parts) < 3:
        text = content_element.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text_parts = [chunk for chunk in chunks if chunk and len(chunk) > 20]

    if text_parts:
        content = f"# Source: {url}\n\n" + "\n\n".join(text_parts)
        logger.info(f"Extracted {len(text_parts)} content blocks, {len(content)} total chars")
        return content
    else:
        logger.warning(f"No meaningful content extracted from {url}")
        return f"# Source: {url}\n\n*No meaningful text content could be extracted from this page.*"

def convert_to_markdown(html_content, url):
    """Convert HTML to Markdown with robust fallbacks."""
    clean_text = extract_clean_text(html_content, url)
    if len(clean_text.strip()) > 200:
        return clean_text
    try:
        md = MarkItDown()
        html_bytes = BytesIO(html_content.encode('utf-8'))
        result = md.convert_stream(html_bytes, mime_type='text/html')
        if result and hasattr(result, 'text_content') and result.text_content:
            markdown_content = result.text_content.strip()
            if markdown_content and len(markdown_content) > 200:
                return f"# Source: {url}\n\n{markdown_content}"
    except Exception as e:
        logger.warning(f"MarkItDown failed for {url}: {e}")
    return clean_text

def is_valid_json(content):
@@ -295,6 +446,64 @@ def is_valid_json(content):
        return False

def create_unique_filename(url):
    """Create a unique filename from URL including fragment."""
    parsed = urlparse(url)
    path = parsed.path
    if not path or path == '/':
        path = 'index'
    fragment = parsed.fragment
    if fragment:
        fragment_clean = re.sub(r'[^a-zA-Z0-9]', '_', fragment)
        filename = f"{path.strip('/')}_{fragment_clean}"
    else:
        filename = path.strip('/')
    filename = re.sub(r'[^a-zA-Z0-9_.-]', '_', filename)
    if len(filename) < 5:
        domain_part = re.sub(r'[^a-zA-Z0-9]', '_', parsed.netloc)
        filename = f"{domain_part}_{filename}"
    if not filename.endswith('.md'):
        filename = f"{filename}.md"
    return filename

def save_files_to_temp_dir(processed_files, temp_dir):
    """Save processed files to temporary directory."""
    saved_files = []
    for file_info in processed_files:
        try:
            file_path = os.path.join(temp_dir, file_info['filename'])
            counter = 1
            original_path = file_path
            while os.path.exists(file_path):
                name, ext = os.path.splitext(original_path)
                file_path = f"{name}_{counter}{ext}"
                counter += 1
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(file_info['content'])
            saved_files.append({
                'file_path': file_path,
                'filename': os.path.basename(file_path),
                'content_type': file_info['content_type'],
                'url': file_info['url']
            })
            logger.info(f"Saved file to temp directory: {os.path.basename(file_path)}")
        except Exception as e:
            logger.error(f"Error saving file {file_info['filename']}: {e}")
    return saved_files

def main():
    parser = argparse.ArgumentParser(description='Scrape a website and create an Open WebUI knowledge base')
    parser.add_argument('--token', '-t', required=True, help='Your OpenWebUI API token')
@@ -308,26 +517,36 @@ def main():
    parser.add_argument('--include-json', '-j', action='store_true', help='Include JSON files and API endpoints')
    parser.add_argument('--update', action='store_true', help='Update existing files in the knowledge base')
    parser.add_argument('--skip-existing', action='store_true', help='Skip existing files in the knowledge base')
    parser.add_argument('--min-content-length', type=int, default=200, help='Minimum content length to include (default: 200 characters)')
    parser.add_argument('--keep-temp-files', action='store_true', help='Keep temporary files for debugging')
    parser.add_argument('--use-selenium', action='store_true', help='Use Selenium for JavaScript-rendered sites')
    args = parser.parse_args()

    if args.update and args.skip_existing:
        logger.error("Cannot use both --update and --skip-existing flags at the same time")
        return 1

    # Check if Selenium is requested but not available
    if args.use_selenium and not SELENIUM_AVAILABLE:
        logger.warning("Selenium requested but not available. Install with: pip install selenium webdriver-manager")
        logger.warning("Falling back to simple crawler.")
        args.use_selenium = False

    scraper = None
    uploader = None
    temp_dir = None
    try:
        logger.info(f"Starting web crawl of {args.website_url} to depth {args.depth}")
        logger.info(f"Using {'Selenium' if args.use_selenium else 'simple'} crawler")
        scraper = WebScraper(
            base_url=args.website_url,
            max_depth=args.depth,
            delay=args.delay,
            exclude_patterns=args.exclude or [],
            use_selenium=args.use_selenium
        )
        scraper.crawl()
@@ -338,46 +557,49 @@ def main():
            logger.error("No pages were crawled. Exiting.")
            return 1

        logger.info("Processing crawled content")
        processed_files = []
        empty_files = 0

        for url, html_content in crawled_pages.items():
            if not html_content or len(html_content.strip()) < 100:
                logger.warning(f"Skipping empty page: {url}")
                empty_files += 1
                continue

            if url.endswith('.json') or (is_valid_json(html_content) and args.include_json):
                if is_valid_json(html_content):
                    try:
                        json_obj = json.loads(html_content)
                        pretty_json = json.dumps(json_obj, indent=2)
                        if len(pretty_json.strip()) >= args.min_content_length:
                            filename = create_unique_filename(url)
                            if not filename.endswith('.json'):
                                filename = f"{filename}.json"

                            processed_files.append({
                                'content': pretty_json,
                                'content_type': 'application/json',
                                'filename': filename,
                                'url': url
                            })
                            logger.info(f"Processed JSON content from {url}")
                        else:
                            logger.warning(f"Skipping JSON file {url} - content too short")
                            empty_files += 1
                        continue
                    except ValueError:
                        pass

            markdown_content = convert_to_markdown(html_content, url)
            if not markdown_content or len(markdown_content.strip()) < args.min_content_length:
                logger.warning(f"Skipping {url} - no extractable content found after conversion")
                empty_files += 1
                continue

            filename = create_unique_filename(url)
            processed_files.append({
                'content': markdown_content,
                'content_type': 'text/markdown',
@@ -385,10 +607,38 @@ def main():
                'url': url
            })

        logger.info(f"Processed {len(processed_files)} files, skipped {empty_files} empty files")

        if not processed_files:
            logger.error("No files with valid content were processed. Exiting.")
            return 1

        script_dir = Path(__file__).parent
        temp_dir = script_dir / "temp_webscraper_files"
        temp_dir.mkdir(exist_ok=True)
        logger.info(f"Created temporary directory: {temp_dir}")

        saved_files = save_files_to_temp_dir(processed_files, temp_dir)
        logger.info(f"Saved {len(saved_files)} files to temporary directory")

        logger.info("=== DEBUG: File Content Analysis ===")
        for file_info in saved_files:
            try:
                with open(file_info['file_path'], 'r', encoding='utf-8') as f:
                    content = f.read()
                lines = content.split('\n')
                meaningful_lines = [line for line in lines if line.strip() and not line.startswith('#') and len(line.strip()) > 20]
                logger.info(f"File: {file_info['filename']}")
                logger.info(f"  Total size: {len(content)} chars")
                logger.info(f"  Total lines: {len(lines)}")
                logger.info(f"  Meaningful lines: {len(meaningful_lines)}")
                if meaningful_lines:
                    logger.info(f"  First meaningful line: {meaningful_lines[0][:100]}{'...' if len(meaningful_lines[0]) > 100 else ''}")
            except Exception as e:
                logger.error(f"Error reading saved file {file_info['filename']}: {e}")
        logger.info("=== END DEBUG ===")

        uploader = OpenWebUIUploader(args.base_url, args.token)
        existing_kb = uploader.get_knowledge_base_by_name(args.kb_name)
@@ -396,7 +646,6 @@ def main():
            kb_id = existing_kb.get('id')
            logger.info(f"Found existing knowledge base '{args.kb_name}' with ID: {kb_id}")
        else:
            logger.info(f"Creating new knowledge base '{args.kb_name}' in Open WebUI")
            kb = uploader.create_knowledge_base(args.kb_name, args.kb_purpose)
            kb_id = kb.get('id')
@@ -405,61 +654,46 @@ def main():
                return 1
            logger.info(f"Created knowledge base with ID: {kb_id}")

        success_count = 0
        skip_count = 0
        update_count = 0
        error_count = 0
        empty_skip_count = 0

        for file_info in saved_files:
            try:
                filename = file_info['filename']
                file_path = file_info['file_path']

                existing_file_id = uploader.file_exists_in_kb(kb_id, filename)
                if existing_file_id and args.skip_existing:
                    logger.info(f"Skipping existing file: {filename}")
                    skip_count += 1
                    continue

                logger.info(f"Uploading file: {filename}")
                result = uploader.upload_file_from_path(
                    kb_id,
                    file_path,
                    filename,
                    file_info['content_type']
                )

                if isinstance(result, dict) and result.get('status') in ['skipped', 'error']:
                    if result.get('status') == 'skipped':
                        empty_skip_count += 1
                    else:
                        error_count += 1
                        logger.warning(f"Failed to upload {filename}: {result.get('reason')}")
                else:
                    success_count += 1

                time.sleep(0.5)
            except Exception as e:
                logger.error(f"Failed to process {file_info['filename']}: {e}")
                error_count += 1

        logger.info(f"Upload complete: {success_count} files uploaded, {update_count} files updated, {skip_count} files skipped, {empty_skip_count} empty/invalid files skipped, {error_count} errors")
        return 0
@@ -467,11 +701,20 @@ def main():
        logger.error(f"An unexpected error occurred: {e}")
        return 1
    finally:
        if scraper:
            scraper.close()
        if uploader:
            uploader.close()
        if temp_dir and temp_dir.exists():
            if args.keep_temp_files:
                logger.info(f"Keeping temporary files in: {temp_dir}")
            else:
                try:
                    shutil.rmtree(temp_dir)
                    logger.info("Cleaned up temporary directory")
                except Exception as e:
                    logger.warning(f"Failed to clean up temporary directory {temp_dir}: {e}")

if __name__ == "__main__":
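
For reference, a minimal sketch of driving the new Selenium-backed crawl path directly from Python. The module name webscraper is an assumption here (the diff does not show the script's file name); the class, functions, and parameters used are the ones added above.

# Hedged usage sketch: exercise the new use_selenium code path end to end.
# "webscraper" is a hypothetical module name for the script changed above.
from webscraper import WebScraper, convert_to_markdown

crawler = WebScraper(
    base_url="https://docs.example.com/",  # placeholder site
    max_depth=1,
    delay=1.0,
    use_selenium=True,  # falls back to requests if Selenium is unavailable
)
try:
    crawler.crawl()
    for url, html in crawler.pages.items():
        markdown = convert_to_markdown(html, url)
        print(f"{url}: {len(markdown)} chars of Markdown")
finally:
    crawler.close()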


@@ -1,3 +1,5 @@
requests
beautifulsoup4
markitdown[all]
selenium
webdriver-manager
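
Note that webdriver-manager is added as a dependency here, but setup_selenium() above constructs webdriver.Chrome(options=...) directly and so relies on a chromedriver already being on PATH. A possible sketch of wiring the new dependency in via Selenium 4's Service API, assuming that is the intent (the commit itself does not do this):

# Possible use of the webdriver-manager dependency inside setup_selenium();
# the committed code instead expects chromedriver to be on PATH.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

chrome_options = Options()
chrome_options.add_argument("--headless")

# Download (and cache) a chromedriver build matching the installed Chrome.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)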