# build_science_embeddings.py
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import trange
from sentence_transformers import SentenceTransformer

PARQUET_PATH = "scopus_corpus.parquet"
OUT_PATH = "scopus_corpus_with_specter.parquet"
BATCH = 64
DEVICE = "cpu"  # set to "cuda" if you have a GPU

# 1) Load the corpus
table = pq.read_table(PARQUET_PATH)
df = table.to_pandas()

# 2) Text for SPECTER: "Title [SEP] Abstract"
def row_text(row):
    title = str(row.get("title", "") or "")
    abstract = str(row.get("abstract", "") or "")
    if abstract.strip():
        return f"{title} [SEP] {abstract}"
    return title

texts = [row_text(r) for _, r in df.iterrows()]

# 3) SPECTER model (SentenceTransformers)
specter = SentenceTransformer("allenai-specter", device=DEVICE)

# 4) Encode in batches
embs = []
for i in trange(0, len(texts), BATCH, desc="SPECTER"):
    batch = texts[i:i+BATCH]
    vecs = specter.encode(batch, normalize_embeddings=True, show_progress_bar=False)
    embs.append(vecs.astype("float32"))
specter_mat = np.vstack(embs).astype("float32")

# 5) Store as a list column in the DataFrame (compatible with your pipeline)
df["specter_embedding"] = [v.tolist() for v in specter_mat]

# (Optional) Quick SciBERT embeddings (mean pooling)
# Only if you want them in addition to SPECTER; otherwise leave this block commented out.
# from transformers import AutoTokenizer, AutoModel
# import torch
# tok = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
# mdl = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(DEVICE)
# mdl.eval()
# sciberts = []
# with torch.no_grad():
#     for i in trange(0, len(texts), BATCH, desc="SciBERT"):
#         batch = texts[i:i+BATCH]
#         enc = tok(batch, padding=True, truncation=True, max_length=256, return_tensors="pt").to(DEVICE)
#         out = mdl(**enc).last_hidden_state        # [B, T, 768]
#         mask = enc.attention_mask.unsqueeze(-1)   # [B, T, 1]
#         summed = (out * mask).sum(dim=1)
#         counts = mask.sum(dim=1).clamp(min=1)
#         mean = summed / counts
#         # L2-normalize
#         mean = torch.nn.functional.normalize(mean, p=2, dim=1)
#         sciberts.append(mean.cpu().numpy().astype("float32"))
# scibert_mat = np.vstack(sciberts).astype("float32")
# df["scibert_embedding"] = [v.tolist() for v in scibert_mat]

# 6) Write Parquet
pq.write_table(pa.Table.from_pandas(df), OUT_PATH)
print("OK ->", OUT_PATH)
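
# (Optional) Usage sketch: consuming the output Parquet
# A minimal, illustrative example (not part of the pipeline above) of how the
# written file could be used: reload it and rank documents by cosine similarity
# to a query paper. Because the embeddings were encoded with
# normalize_embeddings=True, a plain dot product equals cosine similarity.
# The names query_idx and top_k are hypothetical and exist only for this sketch;
# uncomment the block to try it.
# import numpy as np
# import pyarrow.parquet as pq
#
# df_out = pq.read_table("scopus_corpus_with_specter.parquet").to_pandas()
# mat = np.asarray(df_out["specter_embedding"].tolist(), dtype="float32")  # [N, 768]
#
# query_idx = 0                      # index of the paper used as the query
# scores = mat @ mat[query_idx]      # dot product == cosine (vectors are L2-normalized)
# top_k = np.argsort(-scores)[1:6]   # 5 most similar papers, skipping the query itself
# print(df_out["title"].iloc[top_k].tolist())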