import os, sys
from pathlib import Path
import aiosqlite
import asyncio
from typing import Optional, Tuple, Dict
from contextlib import asynccontextmanager
import logging
import json  # Added for serialization/deserialization
from .utils import ensure_content_dirs, generate_content_hash
from .models import CrawlResult, MarkdownGenerationResult
import xxhash
import aiofiles
from .config import NEED_MIGRATION
from .version_manager import VersionManager
from .async_logger import AsyncLogger
from .utils import get_error_context, create_box_message

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

base_directory = DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
os.makedirs(DB_PATH, exist_ok=True)
DB_PATH = os.path.join(base_directory, "crawl4ai.db")

class AsyncDatabaseManager:
    def __init__(self, pool_size: int = 10, max_retries: int = 3):
        self.db_path = DB_PATH
        self.content_paths = ensure_content_dirs(os.path.dirname(DB_PATH))
        self.pool_size = pool_size
        self.max_retries = max_retries
        self.connection_pool: Dict[int, aiosqlite.Connection] = {}
        self.pool_lock = asyncio.Lock()
        self.init_lock = asyncio.Lock()
        self.connection_semaphore = asyncio.Semaphore(pool_size)
        self._initialized = False
        self.version_manager = VersionManager()
        self.logger = AsyncLogger(
            log_file=os.path.join(base_directory, ".crawl4ai", "crawler_db.log"),
            verbose=False,
            tag_width=10
        )

    async def initialize(self):
        """Initialize the database and connection pool"""
        try:
            self.logger.info("Initializing database", tag="INIT")

            # Ensure the database directory exists
            os.makedirs(os.path.dirname(self.db_path), exist_ok=True)

            # Check if version update is needed
            needs_update = self.version_manager.needs_update()

            # Always ensure base table exists
            await self.ainit_db()

            # Verify the table exists
            async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
                async with db.execute(
                    "SELECT name FROM sqlite_master WHERE type='table' AND name='crawled_data'"
                ) as cursor:
                    result = await cursor.fetchone()
                    if not result:
                        raise Exception("crawled_data table was not created")

            # If version changed or fresh install, run updates
            if needs_update:
                self.logger.info("New version detected, running updates", tag="INIT")
                await self.update_db_schema()
                from .migrations import run_migration  # Import here to avoid circular imports
                await run_migration()
                self.version_manager.update_version()  # Update stored version after successful migration
                self.logger.success("Version update completed successfully", tag="COMPLETE")
            else:
                self.logger.success("Database initialization completed successfully", tag="COMPLETE")

        except Exception as e:
            self.logger.error(
                message="Database initialization error: {error}",
                tag="ERROR",
                params={"error": str(e)}
            )
            self.logger.info(
                message="Database will be initialized on first use",
                tag="INIT"
            )
            raise

    async def cleanup(self):
        """Cleanup connections when shutting down"""
        async with self.pool_lock:
            for conn in self.connection_pool.values():
                await conn.close()
            self.connection_pool.clear()
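
    # Connections are pooled per asyncio task: each task is handed at most one
    # SQLite connection, overall concurrency is bounded by `pool_size` via the
    # semaphore, and a task's connection is closed and dropped from the pool
    # when its context exits.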

    @asynccontextmanager
    async def get_connection(self):
        """Connection pool manager with enhanced error handling"""
        if not self._initialized:
            async with self.init_lock:
                if not self._initialized:
                    try:
                        await self.initialize()
                        self._initialized = True
                    except Exception as e:
                        import sys
                        error_context = get_error_context(sys.exc_info())
                        self.logger.error(
                            message="Database initialization failed:\n{error}\n\nContext:\n{context}\n\nTraceback:\n{traceback}",
                            tag="ERROR",
                            force_verbose=True,
                            params={
                                "error": str(e),
                                "context": error_context["code_context"],
                                "traceback": error_context["full_traceback"]
                            }
                        )
                        raise

        await self.connection_semaphore.acquire()
        task_id = id(asyncio.current_task())
        try:
            async with self.pool_lock:
                if task_id not in self.connection_pool:
                    try:
                        conn = await aiosqlite.connect(
                            self.db_path,
                            timeout=30.0
                        )
                        await conn.execute('PRAGMA journal_mode = WAL')
                        await conn.execute('PRAGMA busy_timeout = 5000')

                        # Verify database structure
                        async with conn.execute("PRAGMA table_info(crawled_data)") as cursor:
                            columns = await cursor.fetchall()
                            column_names = [col[1] for col in columns]
                            expected_columns = {
                                'url', 'html', 'cleaned_html', 'markdown', 'extracted_content',
                                'success', 'media', 'links', 'metadata', 'screenshot',
                                'response_headers', 'downloaded_files'
                            }
                            missing_columns = expected_columns - set(column_names)
                            if missing_columns:
                                raise ValueError(f"Database missing columns: {missing_columns}")

                        self.connection_pool[task_id] = conn
                    except Exception as e:
                        import sys
                        error_context = get_error_context(sys.exc_info())
                        error_message = (
                            f"Unexpected error in db get_connection at line {error_context['line_no']} "
                            f"in {error_context['function']} ({error_context['filename']}):\n"
                            f"Error: {str(e)}\n\n"
                            f"Code context:\n{error_context['code_context']}"
                        )
                        self.logger.error(
                            message=create_box_message(error_message, type="error"),
                        )
                        raise

            yield self.connection_pool[task_id]

        except Exception as e:
            import sys
            error_context = get_error_context(sys.exc_info())
            error_message = (
                f"Unexpected error in db get_connection at line {error_context['line_no']} "
                f"in {error_context['function']} ({error_context['filename']}):\n"
                f"Error: {str(e)}\n\n"
                f"Code context:\n{error_context['code_context']}"
            )
            self.logger.error(
                message=create_box_message(error_message, type="error"),
            )
            raise
        finally:
            async with self.pool_lock:
                if task_id in self.connection_pool:
                    await self.connection_pool[task_id].close()
                    del self.connection_pool[task_id]
            self.connection_semaphore.release()

    async def execute_with_retry(self, operation, *args):
        """Execute database operations with retry logic"""
        for attempt in range(self.max_retries):
            try:
                async with self.get_connection() as db:
                    result = await operation(db, *args)
                    await db.commit()
                    return result
            except Exception as e:
                if attempt == self.max_retries - 1:
                    self.logger.error(
                        message="Operation failed after {retries} attempts: {error}",
                        tag="ERROR",
                        force_verbose=True,
                        params={
                            "retries": self.max_retries,
                            "error": str(e)
                        }
                    )
                    raise
                await asyncio.sleep(1 * (attempt + 1))  # Linearly increasing backoff between retries

    async def ainit_db(self):
        """Initialize database schema"""
        async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
            await db.execute('''
                CREATE TABLE IF NOT EXISTS crawled_data (
                    url TEXT PRIMARY KEY,
                    html TEXT,
                    cleaned_html TEXT,
                    markdown TEXT,
                    extracted_content TEXT,
                    success BOOLEAN,
                    media TEXT DEFAULT "{}",
                    links TEXT DEFAULT "{}",
                    metadata TEXT DEFAULT "{}",
                    screenshot TEXT DEFAULT "",
                    response_headers TEXT DEFAULT "{}",
                    downloaded_files TEXT DEFAULT "{}"  -- New column added
                )
            ''')
            await db.commit()

    async def update_db_schema(self):
        """Update database schema if needed"""
        async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
            cursor = await db.execute("PRAGMA table_info(crawled_data)")
            columns = await cursor.fetchall()
            column_names = [column[1] for column in columns]

            # List of new columns to add
            new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files']
            for column in new_columns:
                if column not in column_names:
                    await self.aalter_db_add_column(column, db)
            await db.commit()

    async def aalter_db_add_column(self, new_column: str, db):
        """Add new column to the database"""
        if new_column == 'response_headers':
            await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"')
        else:
            await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
        self.logger.info(
            message="Added column '{column}' to the database",
            tag="INIT",
            params={"column": new_column}
        )

    async def aget_cached_url(self, url: str) -> Optional[CrawlResult]:
        """Retrieve cached URL data as CrawlResult"""
        async def _get(db):
            async with db.execute(
                'SELECT * FROM crawled_data WHERE url = ?', (url,)
            ) as cursor:
                row = await cursor.fetchone()
                if not row:
                    return None

                # Get column names
                columns = [description[0] for description in cursor.description]
                # Create dict from row data
                row_dict = dict(zip(columns, row))

                # Load content from files using stored hashes
                content_fields = {
                    'html': row_dict['html'],
                    'cleaned_html': row_dict['cleaned_html'],
                    'markdown': row_dict['markdown'],
                    'extracted_content': row_dict['extracted_content'],
                    'screenshot': row_dict['screenshot'],
                    'screenshots': row_dict['screenshot'],
                }
                for field, hash_value in content_fields.items():
                    if hash_value:
                        content = await self._load_content(
                            hash_value,
                            field.split('_')[0]  # Get content type from field name
                        )
                        row_dict[field] = content or ""
                    else:
                        row_dict[field] = ""

                # Parse JSON fields
                json_fields = ['media', 'links', 'metadata', 'response_headers', 'markdown']
                for field in json_fields:
                    try:
                        row_dict[field] = json.loads(row_dict[field]) if row_dict[field] else {}
                    except json.JSONDecodeError:
                        row_dict[field] = {}

                if isinstance(row_dict['markdown'], Dict):
                    row_dict['markdown_v2'] = row_dict['markdown']
                    if row_dict['markdown'].get('raw_markdown'):
                        row_dict['markdown'] = row_dict['markdown']['raw_markdown']

                # Parse downloaded_files
                try:
                    row_dict['downloaded_files'] = json.loads(row_dict['downloaded_files']) if row_dict['downloaded_files'] else []
                except json.JSONDecodeError:
                    row_dict['downloaded_files'] = []

                # Remove any fields not in CrawlResult model
                valid_fields = CrawlResult.__annotations__.keys()
                filtered_dict = {k: v for k, v in row_dict.items() if k in valid_fields}
                return CrawlResult(**filtered_dict)

        try:
            return await self.execute_with_retry(_get)
        except Exception as e:
            self.logger.error(
                message="Error retrieving cached URL: {error}",
                tag="ERROR",
                force_verbose=True,
                params={"error": str(e)}
            )
            return None
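
    # Note: markdown is normalized to a serialized MarkdownGenerationResult
    # before storage, so both plain-string and structured markdown round-trip
    # through the cache in acache_url below.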

    async def acache_url(self, result: CrawlResult):
        """Cache CrawlResult data"""
        # Store content files and get hashes
        content_map = {
            'html': (result.html, 'html'),
            'cleaned_html': (result.cleaned_html or "", 'cleaned'),
            'markdown': None,
            'extracted_content': (result.extracted_content or "", 'extracted'),
            'screenshot': (result.screenshot or "", 'screenshots')
        }

        try:
            if isinstance(result.markdown, MarkdownGenerationResult):
                content_map['markdown'] = (result.markdown.model_dump_json(), 'markdown')
            elif hasattr(result, 'markdown_v2'):
                content_map['markdown'] = (result.markdown_v2.model_dump_json(), 'markdown')
            elif isinstance(result.markdown, str):
                markdown_result = MarkdownGenerationResult(raw_markdown=result.markdown)
                content_map['markdown'] = (markdown_result.model_dump_json(), 'markdown')
            else:
                content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown')
        except Exception as e:
            self.logger.warning(
                message=f"Error processing markdown content: {str(e)}",
                tag="WARNING"
            )
            # Fallback to empty markdown result
            content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown')

        content_hashes = {}
        for field, (content, content_type) in content_map.items():
            content_hashes[field] = await self._store_content(content, content_type)

        async def _cache(db):
            await db.execute('''
                INSERT INTO crawled_data (
                    url, html, cleaned_html, markdown,
                    extracted_content, success, media, links, metadata,
                    screenshot, response_headers, downloaded_files
                )
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(url) DO UPDATE SET
                    html = excluded.html,
                    cleaned_html = excluded.cleaned_html,
                    markdown = excluded.markdown,
                    extracted_content = excluded.extracted_content,
                    success = excluded.success,
                    media = excluded.media,
                    links = excluded.links,
                    metadata = excluded.metadata,
                    screenshot = excluded.screenshot,
                    response_headers = excluded.response_headers,
                    downloaded_files = excluded.downloaded_files
            ''', (
                result.url,
                content_hashes['html'],
                content_hashes['cleaned_html'],
                content_hashes['markdown'],
                content_hashes['extracted_content'],
                result.success,
                json.dumps(result.media),
                json.dumps(result.links),
                json.dumps(result.metadata or {}),
                content_hashes['screenshot'],
                json.dumps(result.response_headers or {}),
                json.dumps(result.downloaded_files or [])
            ))

        try:
            await self.execute_with_retry(_cache)
        except Exception as e:
            self.logger.error(
                message="Error caching URL: {error}",
                tag="ERROR",
                force_verbose=True,
                params={"error": str(e)}
            )

    async def aget_total_count(self) -> int:
        """Get total number of cached URLs"""
        async def _count(db):
            async with db.execute('SELECT COUNT(*) FROM crawled_data') as cursor:
                result = await cursor.fetchone()
                return result[0] if result else 0

        try:
            return await self.execute_with_retry(_count)
        except Exception as e:
            self.logger.error(
                message="Error getting total count: {error}",
                tag="ERROR",
                force_verbose=True,
                params={"error": str(e)}
            )
            return 0

    async def aclear_db(self):
        """Clear all data from the database"""
        async def _clear(db):
            await db.execute('DELETE FROM crawled_data')

        try:
            await self.execute_with_retry(_clear)
        except Exception as e:
            self.logger.error(
                message="Error clearing database: {error}",
                tag="ERROR",
                force_verbose=True,
                params={"error": str(e)}
            )

    async def aflush_db(self):
        """Drop the entire table"""
        async def _flush(db):
            await db.execute('DROP TABLE IF EXISTS crawled_data')

        try:
            await self.execute_with_retry(_flush)
        except Exception as e:
            self.logger.error(
                message="Error flushing database: {error}",
                tag="ERROR",
                force_verbose=True,
                params={"error": str(e)}
            )
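
    # Large payloads (HTML, markdown, screenshots, ...) are stored outside
    # SQLite as content-addressed files: the table rows hold only the hashes
    # returned by _store_content, and _load_content resolves them back to text.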

    async def _store_content(self, content: str, content_type: str) -> str:
        """Store content in filesystem and return hash"""
        if not content:
            return ""

        content_hash = generate_content_hash(content)
        file_path = os.path.join(self.content_paths[content_type], content_hash)

        # Only write if file doesn't exist
        if not os.path.exists(file_path):
            async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
                await f.write(content)
        return content_hash

    async def _load_content(self, content_hash: str, content_type: str) -> Optional[str]:
        """Load content from filesystem by hash"""
        if not content_hash:
            return None

        file_path = os.path.join(self.content_paths[content_type], content_hash)
        try:
            async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
                return await f.read()
        except Exception:
            self.logger.error(
                message="Failed to load content: {file_path}",
                tag="ERROR",
                force_verbose=True,
                params={"file_path": file_path}
            )
            return None

# Create a singleton instance
async_db_manager = AsyncDatabaseManager()
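
# Example usage (a minimal sketch, not an official entry point): run with
# `python -m crawl4ai.async_database`, assuming the package is importable as
# `crawl4ai`. It only exercises the read-side coroutines; a CrawlResult from
# an actual crawl would be stored via acache_url.
if __name__ == "__main__":
    async def _demo():
        await async_db_manager.initialize()

        # Look up a URL in the cache and report the overall cache size.
        cached = await async_db_manager.aget_cached_url("https://example.com")
        total = await async_db_manager.aget_total_count()
        print(f"cached: {cached.url if cached else 'miss'}, total rows: {total}")

        # A CrawlResult produced by a crawl could be stored with:
        #     await async_db_manager.acache_url(result)

        # Release pooled connections before exiting.
        await async_db_manager.cleanup()

    asyncio.run(_demo())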