# build_science_embeddings.py
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import trange
from sentence_transformers import SentenceTransformer
PARQUET_PATH = "scopus_corpus.parquet"
OUT_PATH = "scopus_corpus_with_specter.parquet"
BATCH = 64
DEVICE = "cpu" # pon "cuda" si tienes GPU
# 1) Carga
table = pq.read_table(PARQUET_PATH)
df = table.to_pandas()
# 2) Text for SPECTER: "Title [SEP] Abstract"
def row_text(row):
    title = str(row.get("title", "") or "")
    abstract = str(row.get("abstract", "") or "")
    if abstract.strip():
        return f"{title} [SEP] {abstract}"
    return title
texts = [row_text(r) for _, r in df.iterrows()]
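# e.g. texts[0] should now look like "Some title [SEP] Some abstract ..."
# (title only when the abstract field is empty).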
# 3) SPECTER model (via SentenceTransformers)
specter = SentenceTransformer("allenai-specter", device=DEVICE)
# 4) Encode in batches
embs = []
for i in trange(0, len(texts), BATCH, desc="SPECTER"):
    batch = texts[i:i+BATCH]
    vecs = specter.encode(batch, normalize_embeddings=True, show_progress_bar=False)
    embs.append(vecs.astype("float32"))
specter_mat = np.vstack(embs).astype("float32")
# 5) Store as Python lists in the DataFrame (compatible with your pipeline)
df["specter_embedding"] = [v.tolist() for v in specter_mat]
# (Optional) Quick SciBERT embeddings (mean pooling)
# Only if you want them in addition to SPECTER; otherwise leave this block commented out.
# from transformers import AutoTokenizer, AutoModel
# import torch
# tok = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
# mdl = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(DEVICE)
# mdl.eval()
# sciberts = []
# with torch.no_grad():
#     for i in trange(0, len(texts), BATCH, desc="SciBERT"):
#         batch = texts[i:i+BATCH]
#         enc = tok(batch, padding=True, truncation=True, max_length=256, return_tensors="pt").to(DEVICE)
#         out = mdl(**enc).last_hidden_state       # [B, T, 768]
#         mask = enc.attention_mask.unsqueeze(-1)  # [B, T, 1]
#         summed = (out * mask).sum(dim=1)
#         counts = mask.sum(dim=1).clamp(min=1)
#         mean = summed / counts
#         # L2-normalize
#         mean = torch.nn.functional.normalize(mean, p=2, dim=1)
#         sciberts.append(mean.cpu().numpy().astype("float32"))
# scibert_mat = np.vstack(sciberts).astype("float32")
# df["scibert_embedding"] = [v.tolist() for v in scibert_mat]
# 6) Write the Parquet output
pq.write_table(pa.Table.from_pandas(df), OUT_PATH)
print("OK ->", OUT_PATH)