# build_science_embeddings.py
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import trange
from sentence_transformers import SentenceTransformer

PARQUET_PATH = "scopus_corpus.parquet"
OUT_PATH = "scopus_corpus_with_specter.parquet"
BATCH = 64
DEVICE = "cpu"   # pon "cuda" si tienes GPU

# 1) Load the corpus
table = pq.read_table(PARQUET_PATH)
df = table.to_pandas()

# 2) Build the SPECTER input text: "Title [SEP] Abstract"
def row_text(row):
    # Guard against missing values: NaN is truthy, so `or ""` alone would
    # produce the literal string "nan" for missing titles/abstracts.
    title = row.get("title", "")
    abstract = row.get("abstract", "")
    title = "" if pd.isna(title) else str(title)
    abstract = "" if pd.isna(abstract) else str(abstract)
    if abstract.strip():
        return f"{title} [SEP] {abstract}"
    return title

texts = [row_text(r) for _, r in df.iterrows()]

# 3) SPECTER model (via SentenceTransformers)
specter = SentenceTransformer("allenai-specter", device=DEVICE)

# 4) Encode in batches
embs = []
for i in trange(0, len(texts), BATCH, desc="SPECTER"):
    batch = texts[i:i+BATCH]
    vecs = specter.encode(batch, normalize_embeddings=True, show_progress_bar=False)
    embs.append(vecs.astype("float32"))
specter_mat = np.vstack(embs).astype("float32")

# 5) Store the vectors as lists in the DataFrame (compatible with the rest of the pipeline)
df["specter_embedding"] = [v.tolist() for v in specter_mat]

# (Optional) Quick SciBERT embeddings (mean pooling).
# Uncomment this block only if you want them in addition to SPECTER.
# from transformers import AutoTokenizer, AutoModel
# import torch
# tok = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
# mdl = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(DEVICE)
# mdl.eval()
# sciberts = []
# with torch.no_grad():
#     for i in trange(0, len(texts), BATCH, desc="SciBERT"):
#         batch = texts[i:i+BATCH]
#         enc = tok(batch, padding=True, truncation=True, max_length=256, return_tensors="pt").to(DEVICE)
#         out = mdl(**enc).last_hidden_state  # [B, T, 768]
#         mask = enc.attention_mask.unsqueeze(-1)  # [B, T, 1]
#         summed = (out * mask).sum(dim=1)
#         counts = mask.sum(dim=1).clamp(min=1)
#         mean = summed / counts
#         # L2-normalize
#         mean = torch.nn.functional.normalize(mean, p=2, dim=1)
#         sciberts.append(mean.cpu().numpy().astype("float32"))
# scibert_mat = np.vstack(sciberts).astype("float32")
# df["scibert_embedding"] = [v.tolist() for v in scibert_mat]

# 6) Save to Parquet
pq.write_table(pa.Table.from_pandas(df), OUT_PATH)
print("OK ->", OUT_PATH)