Spaces:
Runtime error
Runtime error
| from fastapi import APIRouter, HTTPException, Query | |
| from fastapi.responses import JSONResponse | |
| import httpx | |
| import os | |
| import json | |
| import re | |
| from urllib.parse import unquote | |
| from PIL import Image | |
| import io | |
| import asyncio | |
| import struct | |
| from typing import Optional, Tuple, List, Dict | |
| import base64 | |
| from functools import lru_cache | |
| import aiofiles | |
| from concurrent.futures import ThreadPoolExecutor | |
| import time | |
# FastAPI router instance for this module.
router = APIRouter()

# Thread pool reserved for the CPU-heavy thumbnail rendering.
# Worker count is cpu_count + 4, capped at 32 (1 CPU is assumed when the
# count cannot be determined).
thumbnail_executor = ThreadPoolExecutor(
    thread_name_prefix="thumbnail_",
    max_workers=min((os.cpu_count() or 1) + 4, 32),
)

# In-memory cache of already-processed URLs and the upper bound on the number
# of entries it keeps.
_url_cache: Dict[str, Dict] = {}
_cache_max_size: int = 1000
# NOTE(review): no route decorator is visible on this handler in the source as
# reviewed — confirm it is registered on `router`.
async def search(
    q: str = Query(..., description="Termo de pesquisa para imagens"),
    min_width: int = Query(1200, description="Largura mínima das imagens (padrão: 1200px)"),
    include_thumbnails: bool = Query(True, description="Incluir miniaturas base64 nas respostas")
):
    """
    Search Google Images for ``q`` and return the widest usable results.

    The results page is fetched, candidate image URLs are extracted, and each
    one is measured (optionally with a base64 thumbnail). Images narrower than
    ``min_width`` are dropped. When fewer than 20 images survive, a second
    query with a different ``tbs`` size filter is issued and its new results
    are merged in (a non-200 answer to that second query is ignored).

    Returns:
        JSONResponse with the keys ``query``, ``min_width_filter``,
        ``total_found``, ``thumbnails_included``, ``processing_time`` and
        ``images`` (at most 50 entries, sorted by width, widest first).

    Raises:
        HTTPException: with Google's own status code when the first query does
            not return 200; 408 when a request to Google times out; 500 for
            any other failure.
    """
    start_time = time.time()

    # HTTPS endpoint: the client below does not follow redirects, so a plain
    # http:// URL that gets redirected to HTTPS would be reported as an error.
    google_images_url = "https://www.google.com/search"
    params = {
        "tbm": "isch",
        "q": q,
        "start": 0,
        "sa": "N",
        "asearch": "arc",
        "cs": "1",
        "tbs": "isz:l",
        "async": "arc_id:srp_GgSMaOPQOtL_5OUPvbSTOQ_110,ffilt:all,ve_name:MoreResultsContainer,inf:1,_id:arc-srp_GgSMaOPQOtL_5OUPvbSTOQ_110,_pms:s,_fmt:pc"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "pt-BR,pt;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Referer": "https://www.google.com/"
    }

    def is_wide_enough(img: Dict) -> bool:
        # `or 0` keeps the comparison safe if a size is present but None.
        return (img.get('width') or 0) >= min_width and (img.get('height') or 0) > 0

    try:
        # First query against Google.
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.get(google_images_url, params=params, headers=headers)
        if response.status_code != 200:
            raise HTTPException(status_code=response.status_code, detail="Erro ao buscar no Google Imagens")
        print(f"Google respondeu em {time.time() - start_time:.2f}s")

        # Pull candidate URLs out of the response body.
        extract_start = time.time()
        images = extract_images_from_response_optimized(response.text)
        print(f"Extração concluída em {time.time() - extract_start:.2f}s - {len(images)} URLs")

        # Measure every candidate concurrently.
        processing_start = time.time()
        enriched_images = await enrich_images_ultra_fast(images, include_thumbnails)
        print(f"Processamento concluído em {time.time() - processing_start:.2f}s")

        valid_images = [img for img in enriched_images if is_wide_enough(img)]

        # Too few results: try once more with the alternative size filter.
        if len(valid_images) < 20:
            params["tbs"] = "isz:lt,islt:4mp"
            async with httpx.AsyncClient(timeout=30.0) as client:
                response2 = await client.get(google_images_url, params=params, headers=headers)
            if response2.status_code == 200:
                additional_images = extract_images_from_response_optimized(response2.text)
                additional_enriched = await enrich_images_ultra_fast(additional_images, include_thumbnails)
                # Merge, skipping URLs that are already in the result list.
                seen_urls = {img.get('url') for img in valid_images}
                for img in additional_enriched:
                    if img.get('url') not in seen_urls and is_wide_enough(img):
                        valid_images.append(img)
                        seen_urls.add(img.get('url'))

        # Widest first, capped at 50 entries.
        valid_images.sort(key=lambda x: x.get('width') or 0, reverse=True)
        final_images = valid_images[:50]

        total_time = time.time() - start_time
        print(f"TEMPO TOTAL: {total_time:.2f}s - {len(final_images)} imagens finais")
        return JSONResponse(content={
            "query": q,
            "min_width_filter": min_width,
            "total_found": len(final_images),
            "thumbnails_included": include_thumbnails,
            "processing_time": round(total_time, 2),
            "images": final_images
        })
    except HTTPException:
        # Deliberate HTTP errors raised above must reach the client unchanged
        # instead of being rewrapped as a generic 500 by the handler below.
        raise
    except httpx.TimeoutException:
        raise HTTPException(status_code=408, detail="Timeout na requisição ao Google")
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erro ao executar a busca: {str(e)}")
@lru_cache(maxsize=2048)
def clean_wikimedia_url_cached(url: str) -> str:
    """
    Return the original-file URL for a Wikimedia thumbnail URL (memoized).

    A URL containing ``wikimedia.org`` and exactly one ``/thumb/`` segment,
    followed by at least three path components, is rewritten to
    ``<prefix>/<c1>/<c2>/<c3>`` — i.e. ``/thumb/`` and everything after the
    third component (the scaled-file name) are dropped. Any other URL is
    returned unchanged.

    Results are kept in a bounded LRU cache, so the same URL is only parsed
    once.
    """
    if 'wikimedia.org' not in url or '/thumb/' not in url:
        return url

    parts = url.split('/thumb/')
    if len(parts) != 2:
        # More than one "/thumb/" segment: ambiguous, leave untouched.
        return url

    before_thumb, after_thumb = parts
    path_parts = after_thumb.split('/')
    if len(path_parts) < 3:
        return url

    original_path = '/'.join(path_parts[:3])
    return f"{before_thumb}/{original_path}"
def extract_images_from_response_optimized(response_text: str) -> List[Dict]:
    """
    Collect unique image URLs found in a raw response body.

    Every http(s) URL ending in .jpg/.jpeg/.png/.webp (case-insensitive) is
    matched; only the first 200 matches are considered. Each one is passed
    through ``clean_wikimedia_url_cached`` and duplicates are dropped while
    keeping first-seen order.

    Returns:
        A list of ``{"url": ..., "width": None, "height": None}`` dicts.
    """
    url_regex = re.compile(r'https?://[^\s"\'<>]+?\.(?:jpg|png|webp|jpeg)\b', re.IGNORECASE)
    candidates = url_regex.findall(response_text)[:200]

    # dict.fromkeys de-duplicates while preserving insertion order.
    unique_urls = dict.fromkeys(clean_wikimedia_url_cached(raw) for raw in candidates)

    return [{"url": cleaned, "width": None, "height": None} for cleaned in unique_urls]
# JPEG start-of-frame markers: the 0xC0-0xCF range minus DHT (0xC4),
# JPG (0xC8) and DAC (0xCC), which carry no frame dimensions.
_JPEG_SOF_MARKERS = frozenset(range(0xC0, 0xD0)) - {0xC4, 0xC8, 0xCC}


def _jpeg_dimensions(data: bytes) -> Optional[Tuple[int, int]]:
    """Walk the JPEG segment chain and read (width, height) from the frame header."""
    pos = 2  # skip the SOI marker
    end = len(data)
    while pos + 4 <= end:
        if data[pos] != 0xFF:
            return None  # lost marker sync: not a segment boundary
        marker = data[pos + 1]
        if marker == 0xFF:
            pos += 1  # fill byte before a marker
            continue
        if marker == 0x01 or 0xD0 <= marker <= 0xD8:
            pos += 2  # standalone markers carry no length field
            continue
        if marker in (0x00, 0xD9, 0xDA):
            # Stuffed byte, end of image, or start of scan data: no frame
            # header is going to follow in a well-formed stream.
            return None
        (segment_length,) = struct.unpack_from('>H', data, pos + 2)
        if segment_length < 2:
            return None
        if marker in _JPEG_SOF_MARKERS:
            if pos + 9 > end:
                return None  # frame header truncated
            # Layout after the marker: length(2) precision(1) height(2) width(2)
            height, width = struct.unpack_from('>HH', data, pos + 5)
            return (width, height) if width > 0 and height > 0 else None
        # Jump over the whole segment so payload bytes (e.g. an embedded EXIF
        # thumbnail with its own frame header) are never mistaken for markers.
        pos += 2 + segment_length
    return None


def _webp_dimensions(data: bytes) -> Optional[Tuple[int, int]]:
    """Read (width, height) from the first chunk of a RIFF/WEBP container."""
    chunk = data[12:16]
    if chunk == b'VP8 ' and len(data) >= 30:
        # Lossy bitstream: two 14-bit fields after the 3-byte frame tag and
        # the 3-byte start code.
        width, height = struct.unpack_from('<HH', data, 26)
        width &= 0x3FFF
        height &= 0x3FFF
    elif chunk == b'VP8L' and len(data) >= 25 and data[20] == 0x2F:
        # Lossless bitstream: 14 bits of (width - 1), then 14 bits of (height - 1).
        (bits,) = struct.unpack_from('<I', data, 21)
        width = (bits & 0x3FFF) + 1
        height = ((bits >> 14) & 0x3FFF) + 1
    elif chunk == b'VP8X' and len(data) >= 30:
        # Extended format: 24-bit (canvas width - 1) and (canvas height - 1).
        width = int.from_bytes(data[24:27], 'little') + 1
        height = int.from_bytes(data[27:30], 'little') + 1
    else:
        return None
    return (width, height) if width > 0 and height > 0 else None


def get_image_size_super_fast(data: bytes) -> Optional[Tuple[int, int]]:
    """
    Read image dimensions from the leading bytes of a JPEG, PNG or WebP file.

    Only headers are inspected, so ``data`` may be a truncated prefix of the
    file (for example the result of an HTTP Range request).

    Args:
        data: Raw bytes starting at the beginning of the image file.

    Returns:
        ``(width, height)`` when the format is recognised and the dimensions
        are present in ``data`` and non-zero; otherwise ``None`` (including
        whenever ``data`` is shorter than 24 bytes).
    """
    if len(data) < 24:
        return None
    try:
        if data[:2] == b'\xff\xd8':
            return _jpeg_dimensions(data)
        if data[:8] == b'\x89PNG\r\n\x1a\n':
            # IHDR is the first chunk: width and height sit at bytes 16-23.
            width, height = struct.unpack_from('>II', data, 16)
            return (width, height) if width > 0 and height > 0 else None
        if data[:4] == b'RIFF' and data[8:12] == b'WEBP':
            return _webp_dimensions(data)
    except struct.error:
        # Defensive: a header field ran past the end of the buffer.
        return None
    return None
def create_thumbnail_cpu_optimized(image_data: bytes, max_size: int = 200) -> Optional[str]:
    """
    Render a JPEG thumbnail and return it as a base64 data URI.

    Intended to be run in a worker thread. The longer side of the result is
    ``max_size`` pixels and the aspect ratio is kept. Images with an alpha
    channel (RGBA/LA) are flattened onto a white background; other non-RGB
    modes are converted to RGB.

    Returns:
        ``"data:image/jpeg;base64,..."`` on success, or ``None`` when the
        input is empty, shorter than 100 bytes, or cannot be processed.
    """
    if not image_data:
        return None
    if len(image_data) < 100:
        return None

    try:
        with Image.open(io.BytesIO(image_data)) as opened:
            picture = opened
            if picture.mode in ('RGBA', 'LA'):
                # Flatten transparency onto white, using the alpha band as mask.
                canvas = Image.new('RGB', picture.size, (255, 255, 255))
                canvas.paste(picture, mask=picture.split()[-1])
                picture = canvas
            elif picture.mode != 'RGB':
                picture = picture.convert('RGB')

            src_w, src_h = picture.size
            if src_w > src_h:
                target = (max_size, max(1, (src_h * max_size) // src_w))
            else:
                target = (max(1, (src_w * max_size) // src_h), max_size)

            # Bilinear is a cheaper filter, adequate at thumbnail sizes.
            reduced = picture.resize(target, Image.Resampling.BILINEAR)

            # optimize=False skips the extra encoder pass.
            out = io.BytesIO()
            reduced.save(out, format='JPEG', quality=80, optimize=False)
            encoded = base64.b64encode(out.getvalue()).decode('utf-8')
            return "data:image/jpeg;base64," + encoded
    except Exception:
        return None
async def download_and_process_image(session: httpx.AsyncClient, url: str, include_thumbnail: bool) -> Dict:
    """
    Fetch one image and determine its dimensions (and optionally a thumbnail).

    Strategy:
      1. Return a copy of the cached result if this (url, include_thumbnail)
         pair has been processed before.
      2. Request progressively larger byte ranges and try to read the
         dimensions (and render the thumbnail) from the partial data.
      3. If something is still missing, fall back to the complete file
         (ignored when it is 2 MB or larger) and use Pillow for the size.

    All network and decoding failures are treated as "not available": the
    function never raises for them and simply leaves the affected fields
    as ``None``.

    Args:
        session: Client used for every request.
        url: Image URL, possibly still carrying JSON-style escapes
            (``\\u003d``, ``\\u0026``, ``\\/``), which are undone first.
        include_thumbnail: Whether to also produce a base64 thumbnail.

    Returns:
        ``{"url", "width", "height"}`` plus ``"thumbnail"`` when
        ``include_thumbnail`` is true.
    """
    cache_key = f"{url}_{include_thumbnail}"
    cached = _url_cache.get(cache_key)
    if cached is not None:
        return cached.copy()

    clean_url = url.replace('\\u003d', '=').replace('\\u0026', '&').replace('\\\\', '').replace('\\/', '/')
    base_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'image/*',
        'Connection': 'close'
    }

    width, height, thumbnail_b64 = None, None, None
    # Set when a server ignores the Range header and answers with the whole
    # file, so the same bytes are not downloaded again.
    full_body = None
    thumbnail_tried_on_full_body = False

    async def render_thumbnail(content: bytes):
        # Pillow work is CPU-bound: keep it off the event loop.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            thumbnail_executor,
            create_thumbnail_cpu_optimized,
            content
        )

    def is_complete() -> bool:
        return bool(width and height and (not include_thumbnail or thumbnail_b64))

    # Incremental ranges: small first, larger only when needed.
    ranges = ['0-8192', '0-32768', '0-131072'] if include_thumbnail else ['0-2048']
    for range_header in ranges:
        try:
            response = await session.get(
                clean_url,
                headers={**base_headers, 'Range': f'bytes={range_header}'},
                timeout=6.0
            )
            if response.status_code not in (200, 206) or len(response.content) <= 100:
                continue
            content = response.content
            if not width or not height:
                dimensions = get_image_size_super_fast(content)
                if dimensions:
                    width, height = dimensions
            if include_thumbnail and not thumbnail_b64:
                thumbnail_b64 = await render_thumbnail(content)
        except Exception:
            # `except Exception` (not a bare except) so that task cancellation
            # still propagates; any other failure just moves to the next range.
            continue

        if is_complete():
            break
        if response.status_code == 200:
            # Range was ignored: this already is the complete file.
            full_body = content
            thumbnail_tried_on_full_body = include_thumbnail
            break

    # Final fallback: work from the complete file.
    if not is_complete():
        try:
            if full_body is None:
                response = await session.get(clean_url, headers=base_headers, timeout=8.0)
                if response.status_code == 200:
                    full_body = response.content
            if full_body is not None and len(full_body) < 2000000:  # max 2 MB
                if not width or not height:
                    try:
                        with Image.open(io.BytesIO(full_body)) as img:
                            width, height = img.size
                    except Exception:
                        pass  # unreadable image: dimensions stay unknown
                if include_thumbnail and not thumbnail_b64 and not thumbnail_tried_on_full_body:
                    thumbnail_b64 = await render_thumbnail(full_body)
        except Exception:
            pass  # best effort: report whatever was gathered so far

    result = {
        "url": clean_url,
        "width": width,
        "height": height
    }
    if include_thumbnail:
        result["thumbnail"] = thumbnail_b64

    # Bounded cache: when full, drop the oldest entry (dicts keep insertion
    # order) instead of refusing every new result from then on.
    if len(_url_cache) >= _cache_max_size:
        _url_cache.pop(next(iter(_url_cache)), None)
    _url_cache[cache_key] = result.copy()
    return result
async def enrich_images_ultra_fast(images: List[Dict], include_thumbnails: bool = True) -> List[Dict]:
    """
    Measure many images concurrently and keep those with known dimensions.

    Each entry's ``"url"`` is handed to ``download_and_process_image`` through
    one shared HTTP client; at most 30 downloads run at the same time.

    Args:
        images: Dicts carrying at least a ``"url"`` key.
        include_thumbnails: Forwarded to ``download_and_process_image``.

    Returns:
        The processed results that have a truthy ``width`` and ``height``.
        An empty list is returned for empty input or when the batch as a
        whole fails.
    """
    if not images:
        return []

    # Cap on simultaneously running downloads.
    semaphore = asyncio.Semaphore(30)

    async with httpx.AsyncClient(
        timeout=httpx.Timeout(10.0),
        limits=httpx.Limits(
            max_keepalive_connections=100,
            max_connections=150,
            keepalive_expiry=30.0
        ),
        # Original author's choice: HTTP/1.1 for many small connections.
        http2=False,
        # Image hosts frequently redirect (http -> https, CDN hops); without
        # this every redirected URL comes back as 3xx and is discarded.
        follow_redirects=True
    ) as client:

        async def process_single_image(image_data):
            async with semaphore:
                return await download_and_process_image(client, image_data["url"], include_thumbnails)

        try:
            print(f"Iniciando processamento ultra-paralelo de {len(images)} imagens...")
            results = await asyncio.gather(
                *(process_single_image(img) for img in images),
                return_exceptions=True
            )

            # gather(return_exceptions=True) can also hand back BaseException
            # instances (e.g. CancelledError); accept only real result dicts.
            valid_results = [
                result for result in results
                if isinstance(result, dict) and result.get('width') and result.get('height')
            ]

            success_rate = len(valid_results) / len(images) * 100
            print(f"Processamento concluído: {len(valid_results)}/{len(images)} ({success_rate:.1f}% sucesso)")
            return valid_results
        except Exception as e:
            print(f"Erro no processamento ultra-rápido: {e}")
            return []
# Additional endpoint: thumbnail for a single image.
# NOTE(review): no route decorator is visible on this handler in the source as
# reviewed — confirm it is registered on `router`.
async def get_thumbnail_fast(
    url: str = Query(..., description="URL da imagem para gerar miniatura"),
    size: int = Query(200, description="Tamanho máximo da miniatura em pixels")
):
    """
    Return a base64 thumbnail and the dimensions of one image.

    Returns:
        JSONResponse with ``url``, ``thumbnail``, ``dimensions``
        (``"<width>x<height>"``, 0 for an unknown side) and ``size``.

    Raises:
        HTTPException: 500 with "Erro ao criar miniatura" when no thumbnail
            could be produced, or 500 with the error text for any unexpected
            failure.
    """
    # NOTE(review): `size` is only echoed back. It is not forwarded to
    # download_and_process_image (which takes no size argument), so the
    # thumbnail is rendered at that helper's default size.
    # SECURITY(review): `url` is caller-supplied and fetched server-side
    # (SSRF risk); consider restricting the hosts/schemes that may be fetched.
    try:
        async with httpx.AsyncClient(timeout=8.0) as client:
            result = await download_and_process_image(client, url, True)

        if not result.get('thumbnail'):
            raise HTTPException(status_code=500, detail="Erro ao criar miniatura")

        return JSONResponse(content={
            "url": result['url'],
            "thumbnail": result['thumbnail'],
            # `or 0`: the keys exist but hold None when the size is unknown.
            "dimensions": f"{result.get('width') or 0}x{result.get('height') or 0}",
            "size": size
        })
    except HTTPException:
        # Keep the specific error raised above instead of rewrapping it.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erro: {str(e)}")
# Shut the thumbnail worker pool down at interpreter exit, without waiting
# for it to finish.
import atexit

atexit.register(thumbnail_executor.shutdown, wait=False)