Fix crawling of JS sites
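This change makes Selenium an optional dependency: when the new --use-selenium flag is set and the import succeeds, pages are rendered in headless Chrome before their HTML and links are collected; otherwise the crawler falls back to plain requests. Uploads also move from an in-memory /tmp round-trip to upload_file_from_path() with content validation, as the hunks below show. For orientation, a minimal sketch of driving the patched WebScraper directly from Python (the module name "scraper" and the example URL are assumptions for illustration, not part of the commit):

# Illustrative only: assumes the script is importable as "scraper".
from scraper import WebScraper

crawler = WebScraper(
    "https://docs.example.com/",      # example URL, not from the commit
    max_depth=2,
    delay=1.0,
    use_selenium=True,                # falls back to requests if Selenium is unavailable
)
try:
    crawler.crawl()                   # fills crawler.pages with URL -> rendered HTML
    for url, html in crawler.pages.items():
        print(url, len(html))
finally:
    crawler.close()                   # quits the WebDriver or closes the requests session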
@@ -9,23 +9,68 @@ from bs4 import BeautifulSoup
 from markitdown import MarkItDown
 import json
 import logging
+from io import BytesIO
+import re
+import tempfile
+import shutil
+from pathlib import Path
 
 # Configure logging
 logging.basicConfig(level=logging.INFO,
                     format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 
+# Try to import Selenium, but make it optional
+try:
+    from selenium import webdriver
+    from selenium.webdriver.chrome.options import Options
+    from selenium.webdriver.common.by import By
+    from selenium.webdriver.support.ui import WebDriverWait
+    from selenium.webdriver.support import expected_conditions as EC
+    from selenium.common.exceptions import TimeoutException, WebDriverException
+    SELENIUM_AVAILABLE = True
+except ImportError:
+    SELENIUM_AVAILABLE = False
+    logger.warning("Selenium not available. Falling back to simple crawler.")
+
 class WebScraper:
-    def __init__(self, base_url, max_depth=2, delay=1.0, exclude_patterns=None):
+    def __init__(self, base_url, max_depth=2, delay=1.0, exclude_patterns=None, use_selenium=False):
         self.base_url = base_url
         self.domain = urlparse(base_url).netloc
         self.visited_urls = set()
         self.max_depth = max_depth
         self.delay = delay
         self.exclude_patterns = exclude_patterns or []
-        self.pages = {} # Dictionary to store URL: HTML content
-        self.session = requests.Session()
+        self.pages = {}
+        self.use_selenium = use_selenium and SELENIUM_AVAILABLE
+
+        if self.use_selenium:
+            self.setup_selenium()
+        else:
+            self.session = requests.Session()
+
+        self.base_path = urlparse(base_url).path.rstrip('/')
+
+    def setup_selenium(self):
+        """Setup Selenium WebDriver with headless Chrome."""
+        try:
+            chrome_options = Options()
+            chrome_options.add_argument("--headless")
+            chrome_options.add_argument("--no-sandbox")
+            chrome_options.add_argument("--disable-dev-shm-usage")
+            chrome_options.add_argument("--disable-gpu")
+            chrome_options.add_argument("--window-size=1920,1080")
+            chrome_options.add_argument("--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
+
+            self.driver = webdriver.Chrome(options=chrome_options)
+            self.driver.set_page_load_timeout(30)
+            logger.info("Selenium WebDriver initialized successfully")
+        except Exception as e:
+            logger.error(f"Failed to initialize Selenium: {e}")
+            logger.info("Falling back to requests")
+            self.use_selenium = False
+            self.session = requests.Session()
 
     def should_exclude(self, url):
         """Check if URL should be excluded based on patterns."""
         for pattern in self.exclude_patterns:
@@ -36,48 +81,102 @@ class WebScraper:
     def is_valid_url(self, url):
         """Check if the URL is valid and belongs to the same domain."""
         parsed = urlparse(url)
-        return bool(parsed.netloc) and parsed.netloc == self.domain
+        if not (parsed.netloc and parsed.netloc == self.domain):
+            return False
+        return parsed.path.startswith(self.base_path)
 
-    def get_links(self, url, html):
-        """Extract all links from the HTML content."""
+    def get_links_selenium(self, url):
+        """Extract all links from the page using Selenium."""
+        try:
+            self.driver.get(url)
+            # Wait for page to load
+            WebDriverWait(self.driver, 10).until(
+                EC.presence_of_element_located((By.TAG_NAME, "body"))
+            )
+
+            # Try to wait for main content to load
+            try:
+                WebDriverWait(self.driver, 5).until(
+                    EC.presence_of_element_located((By.TAG_NAME, "main"))
+                )
+            except TimeoutException:
+                pass
+
+            # Get page source after JavaScript execution
+            html = self.driver.page_source
+            self.pages[url] = html
+
+            # Extract links
+            links = set()
+            for a_tag in self.driver.find_elements(By.TAG_NAME, "a"):
+                href = a_tag.get_attribute("href")
+                if href:
+                    full_url = urljoin(url, href)
+                    if self.is_valid_url(full_url) and not self.should_exclude(full_url):
+                        links.add(full_url)
+
+            return list(links), html
+
+        except Exception as e:
+            logger.error(f"Error getting links with Selenium from {url}: {e}")
+            return [], ""
+
+    def get_links_requests(self, url, html):
+        """Extract all links from the HTML content using requests."""
         soup = BeautifulSoup(html, 'html.parser')
+        links = set()
         for a_tag in soup.find_all('a', href=True):
             href = a_tag['href']
-            # Handle relative URLs
             full_url = urljoin(url, href)
-            # Filter URLs to only include those from the same domain
             if self.is_valid_url(full_url) and not self.should_exclude(full_url):
-                yield full_url
+                links.add(full_url)
+        return list(links)
+
+    def get_page_requests(self, url):
+        """Get page content using requests."""
+        try:
+            response = self.session.get(url, timeout=10)
+            if response.status_code == 200:
+                return response.text
+            else:
+                logger.warning(f"Failed to fetch {url}: HTTP {response.status_code}")
+                return None
+        except Exception as e:
+            logger.error(f"Error fetching {url} with requests: {e}")
+            return None
 
     def crawl(self, url=None, depth=0):
         """Crawl the website starting from the URL up to max_depth."""
         if url is None:
             url = self.base_url
 
-        # Stop if we've reached max depth or already visited this URL
         if depth > self.max_depth or url in self.visited_urls:
             return
 
-        # Mark this URL as visited
         self.visited_urls.add(url)
 
         try:
             logger.info(f"Crawling: {url} (Depth: {depth})")
-            response = self.session.get(url, timeout=10)
 
-            if response.status_code == 200:
-                # Store the HTML content
-                self.pages[url] = response.text
-
-                # Extract and follow links
-                if depth < self.max_depth:
-                    for link in self.get_links(url, response.text):
-                        # Be nice to the server - add delay
-                        time.sleep(self.delay)
-                        self.crawl(link, depth + 1)
+            if self.use_selenium:
+                links, html = self.get_links_selenium(url)
+                if html:
+                    self.pages[url] = html
             else:
-                logger.warning(f"Failed to fetch {url}: HTTP {response.status_code}")
+                html = self.get_page_requests(url)
+                if html:
+                    self.pages[url] = html
+                    links = self.get_links_requests(url, html)
+                else:
+                    links = []
+
+            # Follow links
+            if depth < self.max_depth and links:
+                logger.info(f"Found {len(links)} links to follow from {url}")
+                for link in links:
+                    time.sleep(self.delay)
+                    self.crawl(link, depth + 1)
 
         except Exception as e:
             logger.error(f"Error crawling {url}: {e}")
 
@@ -86,8 +185,10 @@ class WebScraper:
         return self.pages
 
     def close(self):
-        """Close the requests session."""
-        if hasattr(self, 'session') and self.session:
+        """Close the requests session or Selenium driver."""
+        if self.use_selenium and hasattr(self, 'driver'):
+            self.driver.quit()
+        elif hasattr(self, 'session'):
             self.session.close()
 
 
@@ -163,102 +264,94 @@ class OpenWebUIUploader:
             logger.error(f"Error creating knowledge base: {e}")
             raise
 
-    def upload_file(self, kb_id, content, filename, content_type="text/markdown"):
-        """Upload a file to the knowledge base."""
+    def validate_content(self, content, filename):
+        """Validate that content is not empty and has sufficient meaningful text."""
+        if not content or not content.strip():
+            return False, "Content is empty"
+
+        # Count meaningful lines (not just headers or empty lines)
+        lines = [line.strip() for line in content.split('\n') if line.strip()]
+        meaningful_lines = [line for line in lines if not line.startswith('#') and len(line) > 20]
+
+        if len(meaningful_lines) < 3:
+            return False, f"Not enough meaningful content ({len(meaningful_lines)} lines)"
+
+        # Count words in clean content
+        clean_content = re.sub(r'#.*?\n', '', content)
+        clean_content = re.sub(r'```.*?```', '', clean_content, flags=re.DOTALL)
+        clean_content = re.sub(r'`.*?`', '', clean_content)
+        clean_content = re.sub(r'\*.*?\*', '', clean_content)
+        clean_content = clean_content.strip()
+
+        words = clean_content.split()
+        if len(words) < 50:
+            return False, f"Content too short ({len(words)} words after cleaning)"
+
+        return True, "Valid content"
+
+    def upload_file_from_path(self, kb_id, file_path, filename, content_type="text/markdown"):
+        """Upload a file to the knowledge base from a file path."""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+        except Exception as e:
+            logger.error(f"Error reading file {file_path}: {e}")
+            return {"status": "error", "reason": f"read_error: {str(e)}"}
+
+        is_valid, validation_msg = self.validate_content(content, filename)
+        if not is_valid:
+            logger.warning(f"Skipping invalid file {filename}: {validation_msg}")
+            return {"status": "skipped", "reason": validation_msg}
+
         upload_endpoint = f"{self.base_url}/api/v1/files/"
 
-        # Create a temporary file for the upload
-        temp_file_path = f"/tmp/{filename}"
-        with open(temp_file_path, 'w') as f:
-            f.write(content)
-
         try:
-            # Use context manager for file upload request
-            with open(temp_file_path, 'rb') as f:
+            with open(file_path, 'rb') as f:
                 files = {'file': (filename, f, content_type)}
-                with self.session.post(
+                upload_response = self.session.post(
                     upload_endpoint,
                     headers={"Authorization": f"Bearer {self.api_token}"},
                     files=files
-                ) as upload_response:
-                    upload_response.raise_for_status()
-                    file_id = upload_response.json().get('id')
+                )
+                upload_response.raise_for_status()
+                file_id = upload_response.json().get('id')
+
+            if not file_id:
+                raise ValueError("No file ID returned from upload")
 
-            # Add the file to the knowledge base
             add_file_endpoint = f"{self.base_url}/api/v1/knowledge/{kb_id}/file/add"
-            with self.session.post(
+            add_response = self.session.post(
                 add_file_endpoint,
                 headers={
                     "Authorization": f"Bearer {self.api_token}",
                     "Content-Type": "application/json"
                 },
                 json={'file_id': file_id}
-            ) as add_response:
-                add_response.raise_for_status()
-                return add_response.json()
+            )
+
+            if add_response.status_code == 400:
+                error_msg = add_response.text
+                if "empty" in error_msg.lower():
+                    logger.warning(f"OpenWebUI rejected file {filename} as empty content")
+                    try:
+                        delete_endpoint = f"{self.base_url}/api/v1/files/{file_id}"
+                        self.session.delete(delete_endpoint)
+                    except:
+                        pass
+                    return {"status": "skipped", "reason": "rejected_as_empty_by_openwebui"}
+                else:
+                    add_response.raise_for_status()
+
+            add_response.raise_for_status()
+            return add_response.json()
 
         except requests.exceptions.RequestException as e:
-            logger.error(f"Error uploading file: {e}")
-            raise
-        finally:
-            # Clean up the temporary file
-            if os.path.exists(temp_file_path):
-                os.unlink(temp_file_path)
-
-    def update_file(self, kb_id, existing_file_id, content, filename, content_type="text/markdown"):
-        """Update an existing file in the knowledge base."""
-        # First upload the new version of the file
-        upload_endpoint = f"{self.base_url}/api/v1/files/"
-
-        # Create a temporary file for the upload
-        temp_file_path = f"/tmp/{filename}"
-        with open(temp_file_path, 'w') as f:
-            f.write(content)
-
-        try:
-            # Upload the new file
-            with open(temp_file_path, 'rb') as f:
-                files = {'file': (filename, f, content_type)}
-                with self.session.post(
-                    upload_endpoint,
-                    headers={"Authorization": f"Bearer {self.api_token}"},
-                    files=files
-                ) as upload_response:
-                    upload_response.raise_for_status()
-                    new_file_id = upload_response.json().get('id')
-
-            # Remove the old file from the knowledge base
-            remove_endpoint = f"{self.base_url}/api/v1/knowledge/{kb_id}/file/remove"
-            with self.session.post(
-                remove_endpoint,
-                headers={
-                    "Authorization": f"Bearer {self.api_token}",
-                    "Content-Type": "application/json"
-                },
-                json={'file_id': existing_file_id}
-            ) as remove_response:
-                remove_response.raise_for_status()
-
-            # Add the new file to the knowledge base
-            add_endpoint = f"{self.base_url}/api/v1/knowledge/{kb_id}/file/add"
-            with self.session.post(
-                add_endpoint,
-                headers={
-                    "Authorization": f"Bearer {self.api_token}",
-                    "Content-Type": "application/json"
-                },
-                json={'file_id': new_file_id}
-            ) as add_response:
-                add_response.raise_for_status()
-                return add_response.json()
-
-        except requests.exceptions.RequestException as e:
-            logger.error(f"Error updating file: {e}")
-            raise
-        finally:
-            # Clean up the temporary file
-            if os.path.exists(temp_file_path):
-                os.unlink(temp_file_path)
+            logger.error(f"Error uploading file {filename}: {e}")
+            if hasattr(e, 'response') and e.response is not None:
+                if e.response.status_code == 400 and "empty" in str(e.response.text).lower():
+                    logger.warning(f"OpenWebUI rejected file {filename} as empty content")
+                    return {"status": "skipped", "reason": "rejected_as_empty_by_openwebui"}
+            return {"status": "error", "reason": f"upload_error: {str(e)}"}
 
     def close(self):
         """Close the requests session."""
@@ -266,24 +359,82 @@ class OpenWebUIUploader:
         self.session.close()
 
 
+def extract_clean_text(html_content, url):
+    """Extract clean, meaningful text from HTML."""
+    soup = BeautifulSoup(html_content, 'html.parser')
+
+    # Remove unwanted elements
+    for element in soup(["script", "style", "nav", "header", "footer", "aside",
+                         "meta", "link", "button", "form", "input", "select"]):
+        element.decompose()
+
+    # Try different content selectors
+    content_selectors = [
+        'main', 'article', '.content', '#content', '.main', '#main',
+        '.documentation', '#documentation', '.doc', '#doc',
+        '.page', '#page', '.post', '#post',
+        'body'
+    ]
+
+    content_element = None
+    for selector in content_selectors:
+        content_element = soup.select_one(selector)
+        if content_element:
+            logger.info(f"Found content using selector: {selector}")
+            break
+
+    if not content_element:
+        content_element = soup
+
+    # Extract text with structure
+    text_parts = []
+
+    for element in content_element.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'div']):
+        text = element.get_text(strip=True)
+        if text and len(text) > 10:
+            if element.name.startswith('h'):
+                level = int(element.name[1])
+                text_parts.append(f"{'#' * level} {text}")
+            elif element.name == 'li':
+                text_parts.append(f"- {text}")
+            else:
+                text_parts.append(text)
+
+    # Fallback to general text extraction
+    if len(text_parts) < 3:
+        text = content_element.get_text()
+        lines = (line.strip() for line in text.splitlines())
+        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+        text_parts = [chunk for chunk in chunks if chunk and len(chunk) > 20]
+
+    if text_parts:
+        content = f"# Source: {url}\n\n" + "\n\n".join(text_parts)
+        logger.info(f"Extracted {len(text_parts)} content blocks, {len(content)} total chars")
+        return content
+    else:
+        logger.warning(f"No meaningful content extracted from {url}")
+        return f"# Source: {url}\n\n*No meaningful text content could be extracted from this page.*"
+
+
 def convert_to_markdown(html_content, url):
-    """Convert HTML content to Markdown using MarkItDown."""
+    """Convert HTML to Markdown with robust fallbacks."""
+    clean_text = extract_clean_text(html_content, url)
+    if len(clean_text.strip()) > 200:
+        return clean_text
+
     try:
         md = MarkItDown()
 
-        # Use BytesIO to provide a binary stream to convert_stream
-        from io import BytesIO
         html_bytes = BytesIO(html_content.encode('utf-8'))
 
-        # Convert the HTML to Markdown
         result = md.convert_stream(html_bytes, mime_type='text/html')
 
-        # Add a header with the source URL
-        markdown_with_header = f"# {url}\n\n{result.text_content}"
-        return markdown_with_header
+        if result and hasattr(result, 'text_content') and result.text_content:
+            markdown_content = result.text_content.strip()
+            if markdown_content and len(markdown_content) > 200:
+                return f"# Source: {url}\n\n{markdown_content}"
     except Exception as e:
-        logger.error(f"Error converting to markdown: {e}")
-        return f"# {url}\n\nError converting content: {str(e)}"
+        logger.warning(f"MarkItDown failed for {url}: {e}")
+
+    return clean_text
 
 
 def is_valid_json(content):
@@ -295,6 +446,64 @@ def is_valid_json(content):
         return False
 
 
+def create_unique_filename(url):
+    """Create a unique filename from URL including fragment."""
+    parsed = urlparse(url)
+
+    path = parsed.path
+    if not path or path == '/':
+        path = 'index'
+
+    fragment = parsed.fragment
+    if fragment:
+        fragment_clean = re.sub(r'[^a-zA-Z0-9]', '_', fragment)
+        filename = f"{path.strip('/')}_{fragment_clean}"
+    else:
+        filename = path.strip('/')
+
+    filename = re.sub(r'[^a-zA-Z0-9_.-]', '_', filename)
+
+    if len(filename) < 5:
+        domain_part = re.sub(r'[^a-zA-Z0-9]', '_', parsed.netloc)
+        filename = f"{domain_part}_{filename}"
+
+    if not filename.endswith('.md'):
+        filename = f"{filename}.md"
+
+    return filename
+
+
+def save_files_to_temp_dir(processed_files, temp_dir):
+    """Save processed files to temporary directory."""
+    saved_files = []
+
+    for file_info in processed_files:
+        try:
+            file_path = os.path.join(temp_dir, file_info['filename'])
+
+            counter = 1
+            original_path = file_path
+            while os.path.exists(file_path):
+                name, ext = os.path.splitext(original_path)
+                file_path = f"{name}_{counter}{ext}"
+                counter += 1
+
+            with open(file_path, 'w', encoding='utf-8') as f:
+                f.write(file_info['content'])
+
+            saved_files.append({
+                'file_path': file_path,
+                'filename': os.path.basename(file_path),
+                'content_type': file_info['content_type'],
+                'url': file_info['url']
+            })
+            logger.info(f"Saved file to temp directory: {os.path.basename(file_path)}")
+        except Exception as e:
+            logger.error(f"Error saving file {file_info['filename']}: {e}")
+
+    return saved_files
+
+
 def main():
     parser = argparse.ArgumentParser(description='Scrape a website and create an Open WebUI knowledge base')
     parser.add_argument('--token', '-t', required=True, help='Your OpenWebUI API token')
@@ -308,26 +517,36 @@ def main():
     parser.add_argument('--include-json', '-j', action='store_true', help='Include JSON files and API endpoints')
     parser.add_argument('--update', action='store_true', help='Update existing files in the knowledge base')
    parser.add_argument('--skip-existing', action='store_true', help='Skip existing files in the knowledge base')
+    parser.add_argument('--min-content-length', type=int, default=200, help='Minimum content length to include (default: 200 characters)')
+    parser.add_argument('--keep-temp-files', action='store_true', help='Keep temporary files for debugging')
+    parser.add_argument('--use-selenium', action='store_true', help='Use Selenium for JavaScript-rendered sites')
 
     args = parser.parse_args()
 
-    # Check for conflicting options
     if args.update and args.skip_existing:
         logger.error("Cannot use both --update and --skip-existing flags at the same time")
         return 1
 
-    # Initialize resources that need to be closed
+    # Check if Selenium is requested but not available
+    if args.use_selenium and not SELENIUM_AVAILABLE:
+        logger.warning("Selenium requested but not available. Install with: pip install selenium webdriver-manager")
+        logger.warning("Falling back to simple crawler.")
+        args.use_selenium = False
+
     scraper = None
     uploader = None
+    temp_dir = None
 
     try:
-        # 1. Crawl the website
         logger.info(f"Starting web crawl of {args.website_url} to depth {args.depth}")
+        logger.info(f"Using {'Selenium' if args.use_selenium else 'simple'} crawler")
+
         scraper = WebScraper(
             base_url=args.website_url,
             max_depth=args.depth,
             delay=args.delay,
-            exclude_patterns=args.exclude or []
+            exclude_patterns=args.exclude or [],
+            use_selenium=args.use_selenium
         )
         scraper.crawl()
 
@@ -338,46 +557,49 @@ def main():
             logger.error("No pages were crawled. Exiting.")
             return 1
 
-        # 2. Process content (convert HTML to Markdown or handle JSON)
         logger.info("Processing crawled content")
         processed_files = []
+        empty_files = 0
 
         for url, html_content in crawled_pages.items():
-            # For JSON content, preserve it as JSON
+            if not html_content or len(html_content.strip()) < 100:
+                logger.warning(f"Skipping empty page: {url}")
+                empty_files += 1
+                continue
+
             if url.endswith('.json') or (is_valid_json(html_content) and args.include_json):
                 if is_valid_json(html_content):
                     try:
                         json_obj = json.loads(html_content)
                         pretty_json = json.dumps(json_obj, indent=2)
 
-                        # Create filename for JSON file
-                        parsed_url = urlparse(url)
-                        filename = f"{parsed_url.netloc}{parsed_url.path}"
-                        filename = filename.replace('/', '_').replace('.', '_')
-                        if not filename.endswith('.json'):
-                            filename = f"{filename}.json"
-
-                        processed_files.append({
-                            'content': pretty_json,
-                            'content_type': 'application/json',
-                            'filename': filename,
-                            'url': url
-                        })
-                        logger.info(f"Processed JSON content from {url}")
+                        if len(pretty_json.strip()) >= args.min_content_length:
+                            filename = create_unique_filename(url)
+                            if not filename.endswith('.json'):
+                                filename = f"{filename}.json"
+
+                            processed_files.append({
+                                'content': pretty_json,
+                                'content_type': 'application/json',
+                                'filename': filename,
+                                'url': url
+                            })
+                            logger.info(f"Processed JSON content from {url}")
+                        else:
+                            logger.warning(f"Skipping JSON file {url} - content too short")
+                            empty_files += 1
                         continue
                     except ValueError:
-                        # Not valid JSON despite the extension, fall back to Markdown
                         pass
 
-            # For all other content, convert to Markdown
             markdown_content = convert_to_markdown(html_content, url)
 
-            # Create a safe filename
-            parsed_url = urlparse(url)
-            filename = f"{parsed_url.netloc}{parsed_url.path}".replace('/', '_').replace('.', '_')
-            if not filename.endswith('.md'):
-                filename = f"{filename}.md"
-
+            if not markdown_content or len(markdown_content.strip()) < args.min_content_length:
+                logger.warning(f"Skipping {url} - no extractable content found after conversion")
+                empty_files += 1
+                continue
+
+            filename = create_unique_filename(url)
             processed_files.append({
                 'content': markdown_content,
                 'content_type': 'text/markdown',
@@ -385,10 +607,38 @@ def main():
                 'url': url
             })
 
-        logger.info(f"Processed {len(processed_files)} files")
+        logger.info(f"Processed {len(processed_files)} files, skipped {empty_files} empty files")
+
+        if not processed_files:
+            logger.error("No files with valid content were processed. Exiting.")
+            return 1
+
+        script_dir = Path(__file__).parent
+        temp_dir = script_dir / "temp_webscraper_files"
+        temp_dir.mkdir(exist_ok=True)
+        logger.info(f"Created temporary directory: {temp_dir}")
+
+        saved_files = save_files_to_temp_dir(processed_files, temp_dir)
+        logger.info(f"Saved {len(saved_files)} files to temporary directory")
+
+        logger.info("=== DEBUG: File Content Analysis ===")
+        for file_info in saved_files:
+            try:
+                with open(file_info['file_path'], 'r', encoding='utf-8') as f:
+                    content = f.read()
+                lines = content.split('\n')
+                meaningful_lines = [line for line in lines if line.strip() and not line.startswith('#') and len(line.strip()) > 20]
+
+                logger.info(f"File: {file_info['filename']}")
+                logger.info(f"  Total size: {len(content)} chars")
+                logger.info(f"  Total lines: {len(lines)}")
+                logger.info(f"  Meaningful lines: {len(meaningful_lines)}")
+                if meaningful_lines:
+                    logger.info(f"  First meaningful line: {meaningful_lines[0][:100]}{'...' if len(meaningful_lines[0]) > 100 else ''}")
+            except Exception as e:
+                logger.error(f"Error reading saved file {file_info['filename']}: {e}")
+        logger.info("=== END DEBUG ===")
 
-        # 3. Upload to Open WebUI
-        # First check if a knowledge base with the specified name already exists
         uploader = OpenWebUIUploader(args.base_url, args.token)
 
         existing_kb = uploader.get_knowledge_base_by_name(args.kb_name)
@@ -396,7 +646,6 @@ def main():
             kb_id = existing_kb.get('id')
             logger.info(f"Found existing knowledge base '{args.kb_name}' with ID: {kb_id}")
         else:
-            # Create a new knowledge base if none exists with that name
            logger.info(f"Creating new knowledge base '{args.kb_name}' in Open WebUI")
             kb = uploader.create_knowledge_base(args.kb_name, args.kb_purpose)
             kb_id = kb.get('id')
@@ -405,61 +654,46 @@ def main():
                 return 1
             logger.info(f"Created knowledge base with ID: {kb_id}")
 
-        # 4. Upload each file
         success_count = 0
         skip_count = 0
         update_count = 0
         error_count = 0
+        empty_skip_count = 0
 
-        for file_info in processed_files:
+        for file_info in saved_files:
             try:
                 filename = file_info['filename']
+                file_path = file_info['file_path']
                 existing_file_id = uploader.file_exists_in_kb(kb_id, filename)
 
-                # Handle existing files based on options
-                if existing_file_id:
-                    if args.skip_existing:
-                        logger.info(f"Skipping existing file: {filename}")
-                        skip_count += 1
-                        continue
-                    elif args.update:
-                        logger.info(f"Updating existing file: {filename}")
-                        uploader.update_file(
-                            kb_id,
-                            existing_file_id,
-                            file_info['content'],
-                            filename,
-                            file_info['content_type']
-                        )
-                        update_count += 1
+                if existing_file_id and args.skip_existing:
+                    logger.info(f"Skipping existing file: {filename}")
+                    skip_count += 1
+                    continue
+
+                logger.info(f"Uploading file: {filename}")
+                result = uploader.upload_file_from_path(
+                    kb_id,
+                    file_path,
+                    filename,
+                    file_info['content_type']
+                )
+                if isinstance(result, dict) and result.get('status') in ['skipped', 'error']:
+                    if result.get('status') == 'skipped':
+                        empty_skip_count += 1
                     else:
-                        # Default behavior: add as new file
-                        logger.info(f"Adding duplicate file (existing file will remain): {filename}")
-                        uploader.upload_file(
-                            kb_id,
-                            file_info['content'],
-                            filename,
-                            file_info['content_type']
-                        )
-                        success_count += 1
+                        error_count += 1
+                        logger.warning(f"Failed to upload {filename}: {result.get('reason')}")
                 else:
-                    # New file
-                    logger.info(f"Uploading new file: {filename}")
-                    uploader.upload_file(
-                        kb_id,
-                        file_info['content'],
-                        filename,
-                        file_info['content_type']
-                    )
                     success_count += 1
 
-                # Add a small delay between uploads
                 time.sleep(0.5)
 
             except Exception as e:
                 logger.error(f"Failed to process {file_info['filename']}: {e}")
                 error_count += 1
 
-        logger.info(f"Upload complete: {success_count} files uploaded, {update_count} files updated, {skip_count} files skipped, {error_count} errors")
+        logger.info(f"Upload complete: {success_count} files uploaded, {update_count} files updated, {skip_count} files skipped, {empty_skip_count} empty/invalid files skipped, {error_count} errors")
 
         return 0
 
@@ -467,11 +701,20 @@ def main():
         logger.error(f"An unexpected error occurred: {e}")
         return 1
     finally:
-        # Ensure all resources are properly closed
        if scraper:
             scraper.close()
         if uploader:
             uploader.close()
 
+        if temp_dir and temp_dir.exists():
+            if args.keep_temp_files:
+                logger.info(f"Keeping temporary files in: {temp_dir}")
+            else:
+                try:
+                    shutil.rmtree(temp_dir)
+                    logger.info("Cleaned up temporary directory")
+                except Exception as e:
+                    logger.warning(f"Failed to clean up temporary directory {temp_dir}: {e}")
+
 
 if __name__ == "__main__":
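One behavioural detail worth noting before the dependency changes below: filenames are now derived by create_unique_filename(), which keeps URL fragments distinct instead of flattening every path the same way. A small sketch of the expected results, assuming the function is importable (the module name is illustrative); the outputs in the comments follow the regex logic in the hunk above:

from scraper import create_unique_filename   # module name "scraper" is an assumption

print(create_unique_filename("https://docs.example.com/guide/install"))
# guide_install.md
print(create_unique_filename("https://docs.example.com/guide/install#docker"))
# guide_install_docker.md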
@@ -1,3 +1,5 @@
 requests
 beautifulsoup4
 markitdown[all]
+selenium
+webdriver-manager
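Since the new code path depends on a working headless Chrome, a standalone smoke test of the same WebDriver setup used by setup_selenium() can save a debugging round-trip. A minimal sketch, assuming selenium is installed and Chrome is available locally (the test URL is illustrative):

# Mirrors the headless-Chrome options added in setup_selenium().
# Assumes Chrome is installed; Selenium 4's Selenium Manager normally resolves the driver.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(options=options)
try:
    driver.get("https://example.com")   # any page works; JS-heavy pages are the interesting case
    print(len(driver.page_source), "characters of rendered HTML")
finally:
    driver.quit()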