# build_science_embeddings.py
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import trange
from sentence_transformers import SentenceTransformer
PARQUET_PATH = "scopus_corpus.parquet"
OUT_PATH = "scopus_corpus_with_specter.parquet"
BATCH = 64
DEVICE = "cpu"  # set to "cuda" if you have a GPU
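# Optional alternative: auto-detect the device. torch is already installed as a
# dependency of sentence-transformers, so this costs nothing extra.
# import torch
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"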
# 1) Load the corpus
table = pq.read_table(PARQUET_PATH)
df = table.to_pandas()
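# NOTE (assumption): the corpus is expected to provide at least "title" and
# "abstract" columns; any other columns are carried through to the output untouched.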
# 2) Build the SPECTER input text: "Title [SEP] Abstract"
def row_text(row):
    # pd.isna() guards against NaN/None; note that `nan or ""` would be truthy
    # and silently produce the string "nan".
    title = row.get("title")
    abstract = row.get("abstract")
    title = "" if pd.isna(title) else str(title)
    abstract = "" if pd.isna(abstract) else str(abstract)
    if abstract.strip():
        return f"{title} [SEP] {abstract}"
    return title
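# Example (made-up row, for illustration): a record with title "Graph Neural
# Networks" and abstract "We study ..." yields
# "Graph Neural Networks [SEP] We study ..."; a record with no abstract yields
# just the title.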
texts = [row_text(r) for _, r in df.iterrows()]
# 3) SPECTER model (via SentenceTransformers)
specter = SentenceTransformer("allenai-specter", device=DEVICE)
# 4) Encode in batches
embs = []
for i in trange(0, len(texts), BATCH, desc="SPECTER"):
    batch = texts[i:i+BATCH]
    vecs = specter.encode(batch, normalize_embeddings=True, show_progress_bar=False)
    embs.append(vecs.astype("float32"))
specter_mat = np.vstack(embs).astype("float32")
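# Optional sanity check (assumes the stock allenai-specter checkpoint, which is
# BERT-base sized, i.e. 768-dimensional; adjust the width if your revision differs):
assert specter_mat.shape == (len(texts), 768), specter_mat.shape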
# 5) Store as a list column in the DataFrame (compatible with your pipeline)
df["specter_embedding"] = [v.tolist() for v in specter_mat]
# (Optional) Fast SciBERT embeddings (mean pooling).
# Only if you want them in addition to SPECTER; otherwise leave this block commented out.
# from transformers import AutoTokenizer, AutoModel
# import torch
# tok = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
# mdl = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(DEVICE)
# mdl.eval()
# sciberts = []
# with torch.no_grad():
#     for i in trange(0, len(texts), BATCH, desc="SciBERT"):
#         batch = texts[i:i+BATCH]
#         enc = tok(batch, padding=True, truncation=True, max_length=256, return_tensors="pt").to(DEVICE)
#         out = mdl(**enc).last_hidden_state        # [B, T, 768]
#         mask = enc.attention_mask.unsqueeze(-1)   # [B, T, 1]
#         summed = (out * mask).sum(dim=1)          # masked sum over tokens
#         counts = mask.sum(dim=1).clamp(min=1)     # token counts, avoiding div-by-zero
#         mean = summed / counts                    # mean pooling
#         mean = torch.nn.functional.normalize(mean, p=2, dim=1)  # L2-normalize
#         sciberts.append(mean.cpu().numpy().astype("float32"))
# scibert_mat = np.vstack(sciberts).astype("float32")
# df["scibert_embedding"] = [v.tolist() for v in scibert_mat]
# 6) Save to Parquet
pq.write_table(pa.Table.from_pandas(df), OUT_PATH)
print("OK ->", OUT_PATH)