# Spaces:
# Sleeping
# Sleeping
import os

from dotenv import load_dotenv

load_dotenv()  # Load environment variables from .env file
# Default provider, used ONLY when the extraction strategy is LLMExtractionStrategy.
DEFAULT_PROVIDER = "openai/gpt-4o-mini"
# NOTE(review): presumably the branch of a model repository to fetch from;
# nothing in this file reads it, so confirm against the code that does.
MODEL_REPO_BRANCH = "new-release-0.0.2"
# Provider-model dictionary, used ONLY when the extraction strategy is
# LLMExtractionStrategy.
#
# Keys are "<provider>/<model>" identifiers. Values are the API token for that
# provider, read from the environment once, when this module is imported.
# os.getenv returns None for a variable that is not set, so a provider whose
# key is missing from the environment maps to None here.
PROVIDER_MODELS = {
    "ollama/llama3": "no-token-needed",  # Any model from Ollama: no API token needed
    "groq/llama3-70b-8192": os.getenv("GROQ_API_KEY"),
    "groq/llama3-8b-8192": os.getenv("GROQ_API_KEY"),
    "openai/gpt-4o-mini": os.getenv("OPENAI_API_KEY"),
    "openai/gpt-4o": os.getenv("OPENAI_API_KEY"),
    "openai/o1-mini": os.getenv("OPENAI_API_KEY"),
    "openai/o1-preview": os.getenv("OPENAI_API_KEY"),
    "anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY"),
    "anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"),
    "anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"),
    "anthropic/claude-3-5-sonnet-20240620": os.getenv("ANTHROPIC_API_KEY"),
}
# Chunk token threshold
CHUNK_TOKEN_THRESHOLD = 2 ** 11  # 2048 tokens
# NOTE(review): presumably the fraction of a chunk shared with its neighbour
# when text is split; the consumer is not in this file, so confirm.
OVERLAP_RATE = 0.1
# NOTE(review): presumably the estimated number of tokens per word used to
# convert word counts to token counts; confirm against the chunking code.
WORD_TOKEN_RATE = 1.3
# Threshold for the minimum number of words in an HTML tag for it to be considered
MIN_WORD_THRESHOLD = 1
# NOTE(review): by its name, the same kind of minimum applied to image
# descriptions; confirm against the code that reads it.
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD = 1
# HTML attribute names singled out as important.
IMPORTANT_ATTRS = ['src', 'href', 'alt', 'title', 'width', 'height']
# Inline HTML tag names treated as text-only eligible.
ONLY_TEXT_ELIGIBLE_TAGS = [
    'b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em',
    'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small',
    'mark',
]
# Domain names of social media sites.
SOCIAL_MEDIA_DOMAINS = [
    'facebook.com',
    'twitter.com',
    'x.com',
    'linkedin.com',
    'instagram.com',
    'pinterest.com',
    'tiktok.com',
    'snapchat.com',
    'reddit.com',
]
# Threshold for the image extraction - range is 1 to 6.
# Images are scored with a point-based system so they can be filtered by
# usefulness. Points are assigned to each image based on the following aspects:
#   - If either height or width exceeds 150px
#   - If image size is greater than 10Kb
#   - If alt property is set
#   - If image format is jpg, png or webp
#   - If image is in the first half of the total images extracted from the page
# NOTE(review): five aspects are listed here but the stated range goes up to 6;
# confirm the criteria against the scoring code.
IMAGE_SCORE_THRESHOLD = 2
# Miscellaneous settings. None of these are read within this file, so their
# exact semantics and units must be confirmed against the consuming code.
MAX_METRICS_HISTORY = 1000
NEED_MIGRATION = True
URL_LOG_SHORTEN_LENGTH = 30
SHOW_DEPRECATION_WARNINGS = True
# NOTE(review): "TRESHOLD" is a misspelling of "THRESHOLD". The name is kept
# as-is because other modules may import it by this spelling.
SCREENSHOT_HEIGHT_TRESHOLD = 10000
# NOTE(review): presumably timeouts in milliseconds (60000 ms = 60 s) - confirm.
PAGE_TIMEOUT = 60000
DOWNLOAD_PAGE_TIMEOUT = 60000