Upload 6 files
Browse files
- build_parquet_embeddings.py +77 -0
- build_science_embeddings.py +68 -0
- journal_recommender_app.py +548 -0
- scopus.py +418 -0
- scopus_corpus.parquet +3 -0
- scopus_corpus_with_specter.parquet +3 -0
build_parquet_embeddings.py
ADDED
@@ -0,0 +1,77 @@
import argparse
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from sentence_transformers import SentenceTransformer
from tqdm import tqdm


def ensure_col(df: pd.DataFrame, name: str):
    if name not in df.columns:
        df[name] = ""
    return df

def main():
    p = argparse.ArgumentParser(description="Construye un Parquet con embeddings E5 para recomendación de revistas.")
    p.add_argument("--csv", required=True, help="Ruta al CSV exportado (ej. uptc_afid60077378_scopus_export.csv)")
    p.add_argument("--out", default="scopus_corpus.parquet", help="Ruta de salida Parquet")
    p.add_argument("--model", default="intfloat/multilingual-e5-small", help="Modelo Sentence-Transformers")
    p.add_argument("--batch-size", type=int, default=64, help="Tamaño de batch para el encode")
    args = p.parse_args()

    df = pd.read_csv(args.csv)

    # Make sure the minimal columns of the "simple" export exist
    for c in ["Title","Source title","ISSN","eISSN","Year","Cited by","DOI","Link","EID","Document Type","Open Access"]:
        ensure_col(df, c)

    # Text used for similarity: works even when Abstract/Keywords are missing.
    # We use the title plus the journal name as soft context.
    df["text_for_match"] = (
        df["Title"].fillna("").astype(str).str.strip()
        + ". Revista: "
        + df["Source title"].fillna("").astype(str).str.strip()
    ).str.replace(r"\s+", " ", regex=True).str.strip()

    # Load the model
    print(f"Cargando modelo: {args.model}")
    model = SentenceTransformer(args.model, device="cpu")

    # E5 prefix: "passage: " for corpus documents
    texts = ["passage: " + t if t else "passage: " for t in df["text_for_match"].tolist()]
    print(f"Codificando {len(texts)} textos…")
    embs = model.encode(
        texts,
        batch_size=args.batch_size,
        show_progress_bar=True,
        normalize_embeddings=True,  # important: cosine similarity then reduces to a dot product
    ).astype(np.float32)

    # Type normalization
    year = pd.to_numeric(df["Year"], errors="coerce").astype("Int64")
    cited = pd.to_numeric(df["Cited by"], errors="coerce").fillna(0).astype(np.int32)

    # Build the Arrow table
    table = pa.table({
        "eid": pa.array(df["EID"].astype(str).tolist()),
        "title": pa.array(df["Title"].astype(str).tolist()),
        "source_title": pa.array(df["Source title"].astype(str).tolist()),
        "issn": pa.array(df["ISSN"].fillna("").astype(str).tolist()),
        "eissn": pa.array(df["eISSN"].fillna("").astype(str).tolist()),
        "year": pa.array(year.tolist(), type=pa.int64()),
        "cited_by": pa.array(cited.tolist(), type=pa.int32()),
        "doi": pa.array(df["DOI"].fillna("").astype(str).tolist()),
        "link": pa.array(df["Link"].fillna("").astype(str).tolist()),
        "Document Type": pa.array(df["Document Type"].astype(str).tolist()),
        "Open Access": pa.array(df["Open Access"].astype(str).tolist()),
        "text_for_match": pa.array(df["text_for_match"].tolist()),
        "embedding": pa.array(embs.tolist(), type=pa.list_(pa.float32())),
    })

    pq.write_table(table, args.out, compression="zstd")
    dim = len(embs[0]) if len(embs) else 0
    print(f"OK -> {args.out} | filas: {table.num_rows} | dim: {dim}")

if __name__ == "__main__":
    main()
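For reference, a minimal sketch of how the resulting Parquet is meant to be consumed (paths and model name match the script's defaults; the query string is only an example). Because the stored vectors are L2-normalized, cosine similarity against a "query: "-prefixed E5 embedding reduces to a dot product:

import numpy as np
import pyarrow.parquet as pq
from sentence_transformers import SentenceTransformer

df = pq.read_table("scopus_corpus.parquet").to_pandas()
docs = np.vstack(df["embedding"].to_list()).astype("float32")  # already L2-normalized

model = SentenceTransformer("intfloat/multilingual-e5-small", device="cpu")
q = model.encode(["query: early fault detection in motors with deep learning"],  # example query
                 normalize_embeddings=True)[0].astype("float32")

sims = docs @ q              # cosine similarity via dot product
top = np.argsort(-sims)[:5]  # five closest articles
print(df.iloc[top][["title", "source_title"]])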
build_science_embeddings.py
ADDED
@@ -0,0 +1,68 @@
# build_science_embeddings.py
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import trange
from sentence_transformers import SentenceTransformer

PARQUET_PATH = "scopus_corpus.parquet"
OUT_PATH = "scopus_corpus_with_specter.parquet"
BATCH = 64
DEVICE = "cpu"  # set to "cuda" if you have a GPU

# 1) Load
table = pq.read_table(PARQUET_PATH)
df = table.to_pandas()

# 2) Text for SPECTER: "Title [SEP] Abstract"
def row_text(row):
    title = str(row.get("title", "") or "")
    abstract = str(row.get("abstract", "") or "")
    if abstract.strip():
        return f"{title} [SEP] {abstract}"
    return title

texts = [row_text(r) for _, r in df.iterrows()]

# 3) SPECTER model (SentenceTransformers)
specter = SentenceTransformer("allenai-specter", device=DEVICE)

# 4) Encode in batches
embs = []
for i in trange(0, len(texts), BATCH, desc="SPECTER"):
    batch = texts[i:i+BATCH]
    vecs = specter.encode(batch, normalize_embeddings=True, show_progress_bar=False)
    embs.append(vecs.astype("float32"))
specter_mat = np.vstack(embs).astype("float32")

# 5) Store as lists in the DataFrame (compatible with the rest of the pipeline)
df["specter_embedding"] = [v.tolist() for v in specter_mat]

# (Optional) Quick SciBERT embeddings (mean pooling).
# Only if you want them in addition to SPECTER; otherwise leave this block commented out.
# from transformers import AutoTokenizer, AutoModel
# import torch
# tok = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
# mdl = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(DEVICE)
# mdl.eval()
# sciberts = []
# with torch.no_grad():
#     for i in trange(0, len(texts), BATCH, desc="SciBERT"):
#         batch = texts[i:i+BATCH]
#         enc = tok(batch, padding=True, truncation=True, max_length=256, return_tensors="pt").to(DEVICE)
#         out = mdl(**enc).last_hidden_state          # [B, T, 768]
#         mask = enc.attention_mask.unsqueeze(-1)     # [B, T, 1]
#         summed = (out * mask).sum(dim=1)
#         counts = mask.sum(dim=1).clamp(min=1)
#         mean = summed / counts
#         # L2-normalize
#         mean = torch.nn.functional.normalize(mean, p=2, dim=1)
#         sciberts.append(mean.cpu().numpy().astype("float32"))
# scibert_mat = np.vstack(sciberts).astype("float32")
# df["scibert_embedding"] = [v.tolist() for v in scibert_mat]

# 6) Write the Parquet
pq.write_table(pa.Table.from_pandas(df), OUT_PATH)
print("OK ->", OUT_PATH)
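A quick post-run sanity check, as a sketch (paths match the constants above). Note that the corpus written by build_parquet_embeddings.py carries no "abstract" column, so row_text() falls back to the title alone for every row:

import numpy as np
import pyarrow.parquet as pq

df = pq.read_table("scopus_corpus_with_specter.parquet").to_pandas()
mat = np.vstack(df["specter_embedding"].to_list()).astype("float32")
print(mat.shape)  # expected: (n_docs, 768) for SPECTER
print(np.allclose(np.linalg.norm(mat, axis=1), 1.0, atol=1e-3))  # True if normalized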
journal_recommender_app.py
ADDED
@@ -0,0 +1,548 @@
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from sentence_transformers import SentenceTransformer
import gradio as gr
import io, os, tempfile, base64, json
from string import Template
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities

# =========================
# Config
# =========================
PARQUET_PATH = "scopus_corpus.parquet"  # use the enriched parquet if you have already generated SPECTER
MODEL_NAME_E5 = "intfloat/multilingual-e5-small"  # fast retriever
MODEL_NAME_SPECTER = "allenai-specter"  # scientific-paper embeddings
qry_prefix = "query: "

# =========================
# Load dataset
# =========================
table = pq.read_table(PARQUET_PATH)
df = table.to_pandas()

# Normalized E5 document embeddings
embeddings = np.vstack(df["embedding"].to_list()).astype("float32")

# SPECTER document embeddings, if present
specter_embs = None
if "specter_embedding" in df.columns:
    specter_embs = np.vstack(df["specter_embedding"].to_list()).astype("float32")
SPECTER_AVAILABLE = specter_embs is not None

# =========================
# Models (E5 eager, SPECTER lazy)
# =========================
model_e5 = SentenceTransformer(MODEL_NAME_E5, device="cpu")
_model_specter = None

def get_specter():
    global _model_specter
    if _model_specter is None:
        _model_specter = SentenceTransformer(MODEL_NAME_SPECTER, device="cpu")
    return _model_specter

# =========================
# Recommendation (table)
# =========================
def recommend(query: str,
              k_articles: int = 300,
              top_n: int = 10,
              min_year: str = "",
              max_year: str = "",
              use_specter: bool = False,
              alpha_e5: float = 0.6):

    query = (query or "").strip()
    if len(query) < 5:
        return pd.DataFrame({"Mensaje": ["Escribe un título o idea más descriptiva (≥ 5 caracteres)."]})

    # Optional year filter
    sub_df = df
    if min_year.strip() or max_year.strip():
        try:
            y0 = int(min_year) if min_year.strip() else None
            y1 = int(max_year) if max_year.strip() else None
        except ValueError:
            y0 = y1 = None
        if y0 is not None:
            sub_df = sub_df[sub_df["year"].fillna(-1) >= y0]
        if y1 is not None:
            sub_df = sub_df[sub_df["year"].fillna(99999) <= y1]
        if sub_df.empty:
            return pd.DataFrame({"Mensaje": ["No hay artículos en el rango de años solicitado."]})

    sub_idx = sub_df.index.to_numpy()
    sub_e5 = embeddings[sub_idx]

    # Query embedding
    q_e5 = model_e5.encode([qry_prefix + query], normalize_embeddings=True)[0].astype("float32")
    sims_e5 = sub_e5 @ q_e5

    sims = sims_e5
    if use_specter and specter_embs is not None:
        # Blend with SPECTER
        spc = specter_embs[sub_idx]
        q_spc = get_specter().encode([query], normalize_embeddings=True)[0].astype("float32")
        sims_spc = spc @ q_spc
        alpha = float(alpha_e5)
        sims = alpha * sims_e5 + (1 - alpha) * sims_spc

    # Top-k most similar articles
    k = min(int(k_articles), len(sub_idx))
    if k <= 0:
        return pd.DataFrame({"Mensaje": ["No hay artículos para comparar."]})

    top_k_idx_local = np.argpartition(-sims, k - 1)[:k]
    top_rows = sub_df.iloc[top_k_idx_local].copy()
    top_rows["sim"] = sims[top_k_idx_local]

    # Aggregate by journal
    grp_cols = ["source_title", "issn", "eissn"]
    best_idx = (top_rows.groupby(grp_cols)["sim"].idxmax())

    agg = (top_rows.groupby(grp_cols)
           .agg(score=("sim", "mean"),
                best=("sim", "max"),
                n=("sim", "size"))
           .reset_index())

    # Extra info (when present)
    extra_cols = ["title", "doi", "link", "year", "Document Type", "Open Access"]
    extra_cols_present = [c for c in extra_cols if c in top_rows.columns]
    best_titles = top_rows.loc[best_idx, grp_cols + extra_cols_present].set_index(grp_cols)
    agg = agg.merge(best_titles, left_on=grp_cols, right_index=True, how="left")

    # Hybrid ranking
    agg["rank"] = agg["score"] * 0.8 + agg["best"] * 0.2 + np.log1p(agg["n"]) * 0.02

    out = (
        agg.sort_values("rank", ascending=False)
        .head(int(top_n))
        .rename(columns={
            "source_title": "Revista",
            "issn": "ISSN",
            "eissn": "eISSN",
            "n": "#similitudes",
            "year": "Año",
            "score": "Score medio",
            "best": "Mejor similitud",
            "title": "Título del artículo",
            "doi": "DOI",
            "link": "Link",
            # "Document Type" and "Open Access" already carry their display names
        })
    )
    if "Año" in out.columns:
        out["Año"] = out["Año"].fillna(0).astype(int).replace(0, "")
    cols = ["Revista","Año","ISSN","eISSN","#similitudes","Score medio","Mejor similitud",
            "Título del artículo","DOI","Link","Document Type","Open Access"]
    out = out[[c for c in cols if c in out.columns]]
    if "Score medio" in out.columns:
        out["Score medio"] = out["Score medio"].round(3)
    if "Mejor similitud" in out.columns:
        out["Mejor similitud"] = out["Mejor similitud"].round(3)
    return out

# =========================
# Interactive graph (vis-network in an iframe)
# =========================
def build_similarity_network_html(query_text: str,
                                  k_articles: int,
                                  min_year: str,
                                  max_year: str,
                                  use_specter: bool = False,
                                  alpha_e5: float = 0.6,
                                  top_nodes: int = 15,
                                  doc_edge_threshold: float = 0.35) -> str:

    qtxt = (query_text or "").strip()
    if len(qtxt) < 5:
        return "<p>Escribe un título/idea más descriptiva (≥ 5 caracteres).</p>"

    # ---- Year filter ----
    sub_df = df
    if (min_year or "").strip() or (max_year or "").strip():
        try:
            y0 = int(min_year) if (min_year or "").strip() else None
            y1 = int(max_year) if (max_year or "").strip() else None
        except ValueError:
            y0 = y1 = None
        if y0 is not None:
            sub_df = sub_df[sub_df["year"].fillna(-1) >= y0]
        if y1 is not None:
            sub_df = sub_df[sub_df["year"].fillna(99999) <= y1]
        if sub_df.empty:
            return "<p>No hay artículos en el rango de años solicitado.</p>"

    sub_idx = sub_df.index.to_numpy()
    sub_e5 = embeddings[sub_idx]

    # ---- Similarity to the query (drives node size) ----
    q_e5 = model_e5.encode([qry_prefix + qtxt], normalize_embeddings=True)[0].astype("float32")
    scores_e5 = sub_e5 @ q_e5

    # Hybrid score (optional)
    ns = scores_e5
    if use_specter and specter_embs is not None:
        spc = specter_embs[sub_idx]
        q_spc = get_specter().encode([qtxt], normalize_embeddings=True)[0].astype("float32")
        scores_spc = spc @ q_spc
        alpha = float(alpha_e5)
        ns = alpha * scores_e5 + (1 - alpha) * scores_spc

    # Top-k by similarity
    k = min(int(k_articles), len(sub_idx))
    top_idx_local = np.argpartition(-ns, k - 1)[:k]
    top_rows = sub_df.iloc[top_idx_local].copy()
    top_rows["sim_to_query"] = ns[top_idx_local]
    top_rows = top_rows.sort_values("sim_to_query", ascending=False).head(int(top_nodes))
    if len(top_rows) < 2:
        return "<p>No hay suficientes artículos para graficar la red.</p>"

    node_idx = top_rows.index.to_numpy()
    node_e5 = embeddings[node_idx]

    # ---- Article-article edges ----
    # E5 by default; if SPECTER is enabled and available, use it for better topical coherence
    pair_mat = node_e5
    if use_specter and specter_embs is not None:
        pair_mat = specter_embs[node_idx]
    pair_sims = pair_mat @ pair_mat.T

    # ---- Colors by year (CP-style teal gradient) ----
    years = top_rows["year"].fillna(0).astype(int).to_numpy()
    y_valid = years[years > 0]
    y_min, y_max = (int(y_valid.min()), int(y_valid.max())) if len(y_valid) else (2000, 2025)

    def teal_year_color(y: int) -> str:
        t = 0.0 if (not y or y <= 0 or y_max == y_min) else (y - y_min) / (y_max - y_min)
        h = 170
        s = int(35 + 35 * t)
        l = int(85 - 30 * t)
        return f"hsl({h}, {s}%, {l}%)"

    # ---- Communities (clusters) for the color=Community mode ----
    ids = [str(row.get("eid", idx)) for idx, row in top_rows.iterrows()]
    Gc = nx.Graph()
    Gc.add_nodes_from(ids)
    n = len(ids)
    for i in range(n):
        for j in range(i + 1, n):
            w = float(pair_sims[i, j])
            if w >= float(doc_edge_threshold):
                Gc.add_edge(ids[i], ids[j], weight=w)

    comms = list(greedy_modularity_communities(Gc, weight="weight")) if Gc.number_of_edges() else [set(ids)]
    node2comm = {nid: ci for ci, c in enumerate(comms) for nid in c}

    def pastel_palette(k, s=60, l=65):
        return [f"hsl({int(360*i/k)}, {s}%, {l}%)" for i in range(max(1, k))]
    comm_colors = pastel_palette(len(comms))
    group_colors = {str(i): comm_colors[i] for i in range(len(comms))}

    # ---- Build nodes/edges for vis.js ----
    ns_nodes = top_rows["sim_to_query"].to_numpy(dtype=float)
    smin, smax = (float(ns_nodes.min()), float(ns_nodes.max())) if ns_nodes.size else (0.0, 0.0)

    def node_size(sim):
        if smax <= smin: return 18
        return 14 + 40 * (float(sim) - smin) / (smax - smin)

    nodes, edges = [], []
    nodes.append({
        "id": "QUERY", "label": "Consulta", "title": qtxt,
        "shape": "star", "size": 46, "color": "#e45756",
        "font": {"size": 16, "strokeWidth": 6, "strokeColor": "#ffffff"}
    })

    for _, row in top_rows.iterrows():
        eid = str(row.get("eid", "")) or str(row.name)
        title = str(row.get("title", ""))[:160]
        journal = str(row.get("source_title", ""))[:120]
        year = int(row.get("year", 0)) if pd.notna(row.get("year", None)) else 0
        doi = str(row.get("doi", "")) or ""
        link = str(row.get("link", "")) or ""
        sim = float(row.get("sim_to_query", 0.0))

        label = (journal or title)[:40] or "Artículo"
        tooltip = (
            f"<b>{title}</b><br>"
            f"Revista: {journal}<br>"
            f"Año: {year if year>0 else 'N/D'}<br>"
            f"Similitud con consulta: {sim:.3f}<br>"
            f"DOI: {doi}<br>"
            f"<a href='{link}' target='_blank'>Abrir</a>"
        )
        group = str(node2comm.get(eid, 0))
        nodes.append({
            "id": eid, "label": label, "title": tooltip,
            "size": node_size(sim), "year": year, "group": group,
            "colorYear": teal_year_color(year),
            "font": {"size": 14, "strokeWidth": 6, "strokeColor": "#ffffff"}
        })
        edges.append({
            "from": "QUERY", "to": eid,
            "value": sim,
            "width": 1 + 6*max(0.0, sim),
            "color": {"color": "#9fb7b3"},
            "smooth": True
        })

    for i in range(n):
        for j in range(i + 1, n):
            w = float(pair_sims[i, j])
            edges.append({
                "from": ids[i], "to": ids[j],
                "value": w,
                "width": max(0.8, 3.0*(w-0.2)),
                "hidden": w < doc_edge_threshold,
                "color": {"color": "#b9c7c5"},
                "smooth": True
            })

    options = {
        "interaction": {
            "hover": True, "multiselect": True, "dragNodes": True,
            "navigationButtons": False,
            "keyboard": {"enabled": True, "bindToWindow": True}
        },
        "physics": {
            "enabled": True, "solver": "forceAtlas2Based",
            "forceAtlas2Based": {
                "avoidOverlap": 0.4, "gravitationalConstant": -45,
                "centralGravity": 0.015, "springLength": 135,
                "springConstant": 0.055, "damping": 0.45
            },
            "stabilization": {"iterations": 140}
        },
        "nodes": {
            "shape": "dot", "borderWidth": 1,
            "shadow": {"enabled": True, "size": 8, "x": 0, "y": 1}
        },
        "edges": {
            "smooth": {"type": "continuous"},
            "selectionWidth": 2,
            "shadow": {"enabled": True, "size": 6, "x": 0, "y": 1}
        }
    }

    tmpl = Template(r"""
<div style="font-family:system-ui,-apple-system,Segoe UI,Roboto; background:#f6f8f9; padding:8px; border-radius:8px;">
  <div style="display:flex; gap:14px; align-items:center; margin:6px 0 10px 0;">
    <div style="white-space:nowrap;">
      <label><b>Color por:</b></label>
      <label style="margin-left:6px;"><input type="radio" name="colorMode" value="year" checked> Año</label>
      <label style="margin-left:6px;"><input type="radio" name="colorMode" value="community"> Comunidad</label>
    </div>
    <div style="min-width:290px;">
      <label for="edgeSlider"><b>Umbral</b>: <span id="edgeVal">$THRESH</span></label>
      <input id="edgeSlider" type="range" min="0" max="1" step="0.01" value="$THRESH"
             style="width:180px; margin-left:8px;">
    </div>
  </div>

  <div style="display:flex; align-items:center; gap:10px; margin:2px 0 8px 6px;">
    <div style="width:82px; text-align:right; color:#5b6b70; font-size:12px;">Años:</div>
    <input id="yearMin" type="range" min="$YMIN" max="$YMAX" value="$YMIN" step="1" style="flex:1;">
    <input id="yearMax" type="range" min="$YMIN" max="$YMAX" value="$YMAX" step="1" style="flex:1;">
    <div id="yearLbl" style="width:130px; text-align:left; color:#5b6b70; font-size:12px;">$YMIN – $YMAX</div>
  </div>
  <div style="height:10px; margin:0 6px 8px 90px; background:linear-gradient(90deg, hsl(170,35%,85%) 0%, hsl(170,70%,55%) 100%); border-radius:6px;"></div>

  <div id="netContainer" style="height:720px; border:1px solid #d6e0e2; border-radius:12px; background:#fbfcfd;"></div>

  <div style="position:relative; margin-top:6px;">
    <div style="position:absolute; left:6px; bottom:6px; display:flex; gap:8px;">
      <button id="btnFit" title="Ajustar vista" style="border:0; background:#e7f0ef; padding:6px 10px; border-radius:10px;">⟲</button>
      <button id="btnPNG" title="Exportar PNG" style="border:0; background:#e7f0ef; padding:6px 10px; border-radius:10px;">⬇</button>
      <button id="btnHelp" title="Ayuda" style="border:0; background:#e7f0ef; padding:6px 10px; border-radius:10px;">?</button>
    </div>
  </div>
</div>

<script src="https://unpkg.com/vis-network@9.1.9/dist/vis-network.min.js"></script>
<script>
(function(){
  const nodes = new vis.DataSet($NODES);
  const edges = new vis.DataSet($EDGES);
  const options = $OPTIONS;
  const groupColors = $GROUPCOLORS;

  const container = document.getElementById('netContainer');
  const net = new vis.Network(container, {nodes, edges}, options);
  window.network = net; window.nodes = nodes; window.edges = edges;

  // Color by year/community
  function applyColors(mode){
    nodes.forEach(n=>{
      if(n.id==='QUERY') return;
      const col = (mode==='community') ? (groupColors[String(n.group)]||'#9fb7b3') : (n.colorYear||'#9fb7b3');
      nodes.update({ id:n.id, color: col });
    });
  }
  applyColors('year');
  document.querySelectorAll('input[name="colorMode"]').forEach(r =>
    r.addEventListener('change', e => applyColors(e.target.value))
  );

  // Edge threshold
  const slider = document.getElementById('edgeSlider');
  const edgeVal = document.getElementById('edgeVal');
  function applyThreshold(th){
    edges.forEach(e=>{
      const show = (e.value||0) >= th || e.from==='QUERY' || e.to==='QUERY';
      edges.update({ id:e.id, hidden:!show, width: show ? (e.width||1) : 0.1 });
    });
  }
  slider.addEventListener('input', ()=>{
    const th = parseFloat(slider.value||'0');
    edgeVal.textContent = th.toFixed(2);
    applyThreshold(th);
  });
  applyThreshold(parseFloat(slider.value||'0'));

  // Timeline (dual slider)
  const sMin=document.getElementById('yearMin'), sMax=document.getElementById('yearMax');
  const yLbl=document.getElementById('yearLbl');
  function applyYearFilter(a,b){
    const lo=Math.min(a,b), hi=Math.max(a,b);
    yLbl.textContent = lo+" – "+hi;
    const visible=new Set();
    nodes.forEach(n=>{
      if(n.id==='QUERY'){ visible.add(n.id); return; }
      const y=Number(n.year||0);
      const show=(y===0)||(y>=lo && y<=hi);
      nodes.update({ id:n.id, hidden:!show });
      if(show) visible.add(n.id);
    });
    edges.forEach(e=>{
      const show=visible.has(e.from)&&visible.has(e.to);
      edges.update({ id:e.id, hidden:!show });
    });
  }
  function clamp(){ let a=+sMin.value, b=+sMax.value; if(a>b) [a,b]=[b,a]; applyYearFilter(a,b); }
  sMin.addEventListener('input',clamp); sMax.addEventListener('input',clamp); clamp();

  // Neighborhood highlighting
  const inactive='rgba(200,210,210,0.35)';
  function highlight(ids){
    const nbr=new Set(ids);
    ids.forEach(id=> net.getConnectedNodes(id).forEach(n=>nbr.add(n)));
    nodes.forEach(n=>{
      const active=nbr.has(n.id)||n.id==='QUERY';
      nodes.update({ id:n.id, color: active?(n.color||'#9fb7b3'):inactive });
    });
  }
  net.on('selectNode', p=>highlight(p.nodes));
  net.on('deselectNode', ()=>applyColors(document.querySelector('input[name="colorMode"]:checked').value));

  // Buttons
  document.getElementById('btnFit').onclick = () => net.fit({animation: true});
  document.getElementById('btnPNG').onclick = () => {
    const url = net.canvas.frame.canvas.toDataURL('image/png');
    const a = document.createElement('a'); a.href = url; a.download = 'graph.png'; a.click();
  };
  document.getElementById('btnHelp').onclick = () => alert(
    "Usa: Color por Año/Comunidad • Umbral de arista • Rango de años • Clic para resaltar vecindad • Doble clic abre el enlace (tooltip)."
  );

  // Double-click opens the link/DOI if present
  net.on('doubleClick', (p) => {
    if (p.nodes && p.nodes.length===1){
      const n = nodes.get(p.nodes[0]);
      if (n && n.title) {
        const tmp = document.createElement('div'); tmp.innerHTML = n.title;
        const a = tmp.querySelector('a'); if (a && a.href) window.open(a.href, '_blank');
      }
    }
  });
})();
</script>
""")

    html = tmpl.substitute(
        NODES=json.dumps(nodes),
        EDGES=json.dumps(edges),
        OPTIONS=json.dumps(options),
        GROUPCOLORS=json.dumps(group_colors),
        YMIN=y_min,
        YMAX=y_max,
        THRESH=f"{doc_edge_threshold:.2f}",
    )

    b64 = base64.b64encode(html.encode("utf-8")).decode("ascii")
    return (
        f'<iframe src="data:text/html;charset=utf-8;base64,{b64}" '
        f'style="width:100%;height:820px;border:0;" '
        f'sandbox="allow-scripts allow-same-origin allow-popups"></iframe>'
    )

# =========================
# Gradio UI
# =========================
with gr.Blocks(title="Recomendador de Revistas (Scopus)") as demo:
    gr.Markdown("## Recomendación de revistas UPTC")

    # --- Main input ---
    with gr.Row():
        query = gr.Textbox(
            label="Título o idea de investigación",
            lines=3,
            placeholder="Ej.: Detección temprana de fallas en motores usando aprendizaje profundo…"
        )

    # --- Year filters ---
    with gr.Row():
        min_year = gr.Textbox(label="Año mínimo (opcional)", placeholder="2019")
        max_year = gr.Textbox(label="Año máximo (opcional)", placeholder="2025")

    # --- Top-k and number of journals ---
    with gr.Row():
        k_articles = gr.Slider(50, 1000, value=300, step=50, label="Artículos considerados (top-k)")
        top_n = gr.Slider(5, 20, value=10, step=1, label="Nº de revistas a mostrar")

    # --- SPECTER fusion ---
    with gr.Row():
        use_specter = gr.Checkbox(
            label="Fusionar con SPECTER (mejor afinidad científica)",
            value=SPECTER_AVAILABLE
        )
        alpha_e5 = gr.Slider(0.0, 1.0, value=0.6, step=0.05, label="Peso E5 (1−α = SPECTER)")

    # --- BUTTONS: always below the fusion row ---
    with gr.Row():
        btn = gr.Button("Recomendar")
        btn_net = gr.Button("Ver red de similitud")

    # --- OUTPUTS (declared afterwards, so the buttons stay fixed above them) ---
    out = gr.Dataframe(
        row_count=10, wrap=True,
        column_widths=[180, 60, 90, 90, 90, 90, 90, 250, 120, 120, 120, 100],
        label="Revistas recomendadas"
    )
    out_net_html = gr.HTML(label="Grafo interactivo (explorable)")

    # --- Event wiring (can be declared once 'out' and 'out_net_html' exist) ---
    btn.click(
        fn=recommend,
        inputs=[query, k_articles, top_n, min_year, max_year, use_specter, alpha_e5],
        outputs=out
    )
    query.submit(
        fn=recommend,
        inputs=[query, k_articles, top_n, min_year, max_year, use_specter, alpha_e5],
        outputs=out
    )
    btn_net.click(
        fn=lambda q, ka, y0, y1, us, a: build_similarity_network_html(
            q, ka, y0, y1, use_specter=us, alpha_e5=a, top_nodes=15, doc_edge_threshold=0.35
        ),
        inputs=[query, k_articles, min_year, max_year, use_specter, alpha_e5],
        outputs=[out_net_html]
    )

if __name__ == "__main__":
    demo.launch()
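Because recommend() is a plain function, the ranking can be exercised without the UI; a minimal sketch, assuming the file is importable as journal_recommender_app (importing it loads the Parquet and the E5 model at import time, while demo.launch() stays behind the __main__ guard; the query string is only an example):

from journal_recommender_app import recommend

tbl = recommend("Detección temprana de fallas en motores usando aprendizaje profundo",
                k_articles=300, top_n=5)
print(tbl[["Revista", "Score medio", "Mejor similitud"]])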
scopus.py
ADDED
@@ -0,0 +1,418 @@
# scopus_simple_extract.py
# Extracts Scopus results by AF-ID and exports ONE "friendly" CSV with basic fields.
# Does NOT request Abstract, Authors, Keywords, Funding, Conference, etc.

import time
import argparse
import urllib.parse as urlparse
from typing import Dict, List, Optional
import requests
import numpy as np
import pandas as pd

BASE_URL_SEARCH = "https://api.elsevier.com/content/search/scopus"

# -------------------------
# HTTP utilities
# -------------------------
def build_headers(api_key: str, insttoken: Optional[str] = None) -> Dict[str, str]:
    h = {"Accept": "application/json", "X-ELS-APIKey": api_key.strip()}
    if insttoken:
        h["X-ELS-Insttoken"] = insttoken.strip()
    return h

def get_json(session: requests.Session, url: str, params: Dict[str, str],
             headers: Dict[str, str], max_retries: int = 6, sleep_base: float = 0.75) -> Dict:
    """
    GET with retries for 429/5xx. On a 401 caused by a mismatched Insttoken, retries WITHOUT the Insttoken.
    """
    last_exc = None
    tried_without_token = False

    for t in range(max_retries + 1):
        try:
            r = session.get(url, params=params, headers=headers, timeout=90)
        except Exception as ex:
            last_exc = ex
            time.sleep((2 ** t) * sleep_base)
            continue

        if r.status_code in (429, 500, 502, 503, 504):
            time.sleep((2 ** t) * sleep_base)
            continue

        if r.status_code == 401:
            # retry once without the Insttoken if the token is not associated with the key
            try:
                j = r.json()
            except Exception:
                j = {}
            if ("Institution Token is not associated with API Key" in str(j)
                    and not tried_without_token
                    and "X-ELS-Insttoken" in headers):
                tried_without_token = True
                h2 = dict(headers)
                h2.pop("X-ELS-Insttoken", None)
                r2 = session.get(url, params=params, headers=h2, timeout=90)
                if r2.ok:
                    try:
                        return r2.json()
                    except Exception:
                        raise RuntimeError("La respuesta no es JSON decodificable.")
                else:
                    try:
                        j2 = r2.json()
                    except Exception:
                        j2 = {}
                    raise RuntimeError(f"HTTP {r2.status_code} – {j2 or r2.text}")

        if not r.ok:
            try:
                j = r.json()
            except Exception:
                j = {}
            raise RuntimeError(f"HTTP {r.status_code} – {j or r.text}")

        try:
            return r.json()
        except Exception:
            raise RuntimeError("La respuesta no es JSON decodificable.")

    if last_exc:
        raise RuntimeError(f"Error de red persistente: {last_exc}")
    raise RuntimeError("No se obtuvo respuesta estable tras varios reintentos.")

# -------------------------
# Search API pagination
# -------------------------
def extract_by_year_cursor(session: requests.Session, headers: Dict[str, str],
                           afid: str, year: int, page_size: int, view: str) -> List[Dict]:
    params = {
        "query": f"AF-ID({afid}) AND PUBYEAR = {year}",
        "view": view,
        "count": str(page_size),
        "cursor": "*",
    }
    entries: List[Dict] = []
    while True:
        j = get_json(session, BASE_URL_SEARCH, params, headers)
        chunk = j.get("search-results", {}).get("entry", []) or []
        if chunk:
            entries.extend(chunk)

        next_token = None
        for ln in j.get("search-results", {}).get("link", []) or []:
            if ln.get("@ref") == "next":
                href = ln.get("@href")
                if href:
                    q = urlparse.urlparse(href).query
                    qd = urlparse.parse_qs(q)
                    next_token = (qd.get("cursor") or [None])[0]
                break
        if not next_token:
            break
        params["cursor"] = next_token
    return entries

def extract_by_year_startcount(session: requests.Session, headers: Dict[str, str],
                               afid: str, year: int, page_size: int, view: str,
                               hard_limit: int = 20000) -> List[Dict]:
    entries: List[Dict] = []
    start = 0
    while start < hard_limit:
        params = {
            "query": f"AF-ID({afid}) AND PUBYEAR = {year}",
            "view": view,
            "count": str(page_size),
            "start": str(start),
        }
        j = get_json(session, BASE_URL_SEARCH, params, headers)
        chunk = j.get("search-results", {}).get("entry", []) or []
        if not chunk:
            break
        entries.extend(chunk)
        if len(chunk) < page_size:
            break
        start += page_size
    return entries

def extract_no_year(session: requests.Session, headers: Dict[str, str],
                    afid: str, page_size: int, view: str, use_cursor: bool) -> List[Dict]:
    entries: List[Dict] = []
    if use_cursor:
        params = {"query": f"AF-ID({afid})", "view": view, "count": str(page_size), "cursor": "*"}
        while True:
            j = get_json(session, BASE_URL_SEARCH, params, headers)
            chunk = j.get("search-results", {}).get("entry", []) or []
            if chunk:
                entries.extend(chunk)
            next_token = None
            for ln in j.get("search-results", {}).get("link", []) or []:
                if ln.get("@ref") == "next":
                    href = ln.get("@href")
                    if href:
                        q = urlparse.urlparse(href).query
                        qd = urlparse.parse_qs(q)
                        next_token = (qd.get("cursor") or [None])[0]
                    break
            if not next_token:
                break
            params["cursor"] = next_token
    else:
        start = 0
        while True:
            params_sc = {"query": f"AF-ID({afid})", "view": view, "count": str(page_size), "start": str(start)}
            j = get_json(session, BASE_URL_SEARCH, params_sc, headers)
            chunk = j.get("search-results", {}).get("entry", []) or []
            if not chunk:
                break
            entries.extend(chunk)
            if len(chunk) < page_size:
                break
            start += page_size
    return entries

# -------------------------
# Basic normalization (no authors/abstract/keywords/funding/etc.)
# -------------------------
TOP_FIELD_MAP = {
    "dc:title": "title",
    # abstract and keywords are intentionally not requested
    "prism:coverDate": "coverDate",
    "prism:doi": "doi",
    "prism:publicationName": "sourceTitle",
    "prism:issn": "issn",
    "prism:eIssn": "eIssn",
    "prism:volume": "volume",
    "prism:issueIdentifier": "issue",
    "prism:pageRange": "pages",
    "citedby-count": "citedBy",
    "subtype": "subtype",
    "subtypeDescription": "subtypeDesc",
    "openaccessFlag": "openAccess",
    "dc:identifier": "identifier",
    "eid": "eid",
    "prism:url": "prismUrl",
}

def links_to_dict(links: List[Dict]) -> Dict[str, str]:
    d = {}
    for ln in links or []:
        ref = ln.get("@ref")
        href = ln.get("@href")
        if ref and href:
            d[f"link_{ref}"] = href
    return d

def normalize_entries(entries: List[Dict]) -> pd.DataFrame:
    rows: List[Dict] = []
    for e in entries:
        row = {}
        for k_src, k_dst in TOP_FIELD_MAP.items():
            if k_src in e:
                row[k_dst] = e.get(k_src)
        row.update(links_to_dict(e.get("link")))
        rows.append(row)

    df = pd.DataFrame(rows)
    if not df.empty:
        if "coverDate" in df.columns:
            df["coverDate"] = pd.to_datetime(df["coverDate"], errors="coerce")
        subset_cols = [c for c in ["eid", "identifier"] if c in df.columns]
        if subset_cols:
            df = df.drop_duplicates(subset=subset_cols, keep="first")
    return df

# -------------------------
# View/pagination fallbacks
# -------------------------
def try_extract_year(session, headers, afid, year, page_size, view, use_cursor) -> List[Dict]:
    def do_extract(ps, cur, v):
        if cur:
            return extract_by_year_cursor(session, headers, afid, year, ps, v)
        else:
            return extract_by_year_startcount(session, headers, afid, year, ps, v)
    try:
        return do_extract(page_size, use_cursor, view)
    except RuntimeError as e:
        msg = str(e)
        if "AUTHORIZATION_ERROR" in msg:
            fallback = "STANDARD" if view == "COMPLETE" else ("BASIC" if view == "STANDARD" else None)
            if fallback:
                return do_extract(page_size, use_cursor, fallback)
            raise
        if "INVALID_INPUT" in msg and "maximum number allowed for the service level" in msg:
            # reduce the page size and drop the cursor
            return do_extract(25, False, view)
        if use_cursor:
            return do_extract(page_size, False, view)
        raise

def try_extract_no_year(session, headers, afid, page_size, view, use_cursor) -> List[Dict]:
    try:
        return extract_no_year(session, headers, afid, page_size, view, use_cursor)
    except RuntimeError as e:
        msg = str(e)
        if "AUTHORIZATION_ERROR" in msg:
            if view == "COMPLETE":  # fall back to STANDARD/BASIC
                return extract_no_year(session, headers, afid, page_size, "STANDARD", use_cursor)
            if view == "STANDARD":
                return extract_no_year(session, headers, afid, page_size, "BASIC", use_cursor)
            raise
        if "INVALID_INPUT" in msg and "maximum number allowed for the service level" in msg:
            return extract_no_year(session, headers, afid, 25, view, False)
        if use_cursor:
            return extract_no_year(session, headers, afid, page_size, view, False)
        raise

def fetch_scopus_affiliation(api_key: str,
                             afid: str = "60077378",
                             year_start: Optional[int] = 2020,
                             year_end: Optional[int] = 2024,
                             view: str = "STANDARD",
                             page_size: int = 100,
                             insttoken: Optional[str] = None,
                             use_cursor: bool = True) -> List[Dict]:
    headers = build_headers(api_key, insttoken)
    session = requests.Session()
    if year_start is None or year_end is None:
        return try_extract_no_year(session, headers, afid, page_size, view, use_cursor)
    entries: List[Dict] = []
    for yr in range(int(year_start), int(year_end) + 1):
        entries.extend(try_extract_year(session, headers, afid, yr, page_size, view, use_cursor))
    return entries

# -------------------------
# Export a SINGLE (lightweight) CSV
# -------------------------
EXPORT_COLUMNS = [
    "Title","Year","Source title","Volume","Issue",
    "Page start","Page end","Page count",
    "Cited by","DOI","Link","ISSN","eISSN","Document Type","Open Access","EID"
]

def _pick_link(row: pd.Series) -> str:
    for c in ("prismUrl","link_scopus","prism:url","link_self"):
        if c in row and pd.notna(row[c]) and str(row[c]).strip():
            return str(row[c])
    return ""

def pick_col(df: pd.DataFrame, primary: str, secondary: str, default: str = "") -> pd.Series:
    """Row-wise fallback: use primary; where it is empty/NaN, take secondary."""
    n = len(df)
    s1 = df[primary] if primary in df.columns else pd.Series([np.nan] * n, index=df.index)
    s2 = df[secondary] if secondary in df.columns else pd.Series([default] * n, index=df.index)
    s1 = s1.copy()
    mask = s1.isna() | (s1.astype(str).str.strip() == "")
    s1.loc[mask] = s2.loc[mask]
    return s1.fillna(default)

def make_export(df: pd.DataFrame) -> pd.DataFrame:
    # Derive Year and the page fields
    out = df.copy()

    if "coverDate" in out.columns:
        out["Year"] = pd.to_datetime(out["coverDate"], errors="coerce").dt.year
    else:
        out["Year"] = ""

    out["Page start"], out["Page end"], out["Page count"] = "", "", ""
    if "pages" in out.columns:
        starts, ends, counts = [], [], []
        for x in out["pages"].fillna(""):
            if "-" in x:
                a, b = x.split("-", 1)
                a_num = "".join(ch for ch in a if ch.isdigit())
                b_num = "".join(ch for ch in b if ch.isdigit())
                starts.append(a_num); ends.append(b_num)
                try:
                    counts.append(str(max(0, int(b_num) - int(a_num) + 1)) if a_num and b_num else "")
                except Exception:
                    counts.append("")
            else:
                starts.append(""); ends.append(""); counts.append("")
        out["Page start"], out["Page end"], out["Page count"] = starts, ends, counts

    # Preferred link
    out["Link"] = out.apply(_pick_link, axis=1)

    # Assemble the final columns (pick_col avoids ambiguous Series truth-values)
    final = pd.DataFrame()
    final["Title"] = out.get("title", "")
    final["Year"] = out.get("Year", "")

    final["Source title"] = pick_col(out, "sourceTitle", "prism:publicationName")
    final["Volume"] = pick_col(out, "volume", "prism:volume")
    final["Issue"] = pick_col(out, "issue", "prism:issueIdentifier")

    final["Page start"] = out["Page start"]
    final["Page end"] = out["Page end"]
    final["Page count"] = out["Page count"]

    final["Cited by"] = pick_col(out, "citedBy", "citedby-count")
    final["DOI"] = pick_col(out, "doi", "prism:doi")
    final["Link"] = out["Link"]

    final["ISSN"] = pick_col(out, "issn", "prism:issn")
    final["eISSN"] = pick_col(out, "eIssn", "prism:eIssn")

    final["Document Type"] = pick_col(out, "subtypeDesc", "subtypeDescription")
    final["Open Access"] = pick_col(out, "openAccess", "openaccessFlag")

    final["EID"] = out.get("eid", "")

    # Sort by year, descending (coerce to numeric to avoid mixed str/int comparisons)
    final["Year"] = pd.to_numeric(final["Year"], errors="coerce")
    final = final.sort_values(by="Year", ascending=False, na_position="last")

    # Reorder/select columns
    return final[EXPORT_COLUMNS]

# -------------------------
# CLI
# -------------------------
def parse_args():
    p = argparse.ArgumentParser(description="Extrae publicaciones Scopus por AF-ID y exporta UN CSV básico (sin autores/abstract/etc.).")
    p.add_argument("--api-key", required=True, help="X-ELS-APIKey")
    p.add_argument("--insttoken", default=None, help="X-ELS-Insttoken (opcional)")
    p.add_argument("--afid", default="60077378", help="Scopus Affiliation ID (AF-ID)")
    p.add_argument("--year-start", default=2020, help="Año inicial o 'None'")
    p.add_argument("--year-end", default=2024, help="Año final o 'None'")
    p.add_argument("--view", default="STANDARD", choices=["BASIC", "STANDARD", "COMPLETE"], help="Vista del Search API")
    p.add_argument("--page-size", type=int, default=100, help="Tamaño de página (25..200)")
    p.add_argument("--use-cursor", action="store_true", help="Usar cursor pagination")
    p.add_argument("--no-cursor", dest="use_cursor", action="store_false", help="Usar start/count")
    p.set_defaults(use_cursor=True)
    p.add_argument("--out-prefix", default="scopus_afid", help="Prefijo de salida")
    return p.parse_args()

def main():
    args = parse_args()

    def norm_year(x):
        sx = str(x).strip().lower()
        return None if sx == "none" else int(x)
    y0 = norm_year(args.year_start)
    y1 = norm_year(args.year_end)

    print("Descargando desde Scopus (Search API)…")
    entries = fetch_scopus_affiliation(
        api_key=args.api_key,
        afid=args.afid,
        year_start=y0,
        year_end=y1,
        view=args.view,
        page_size=args.page_size,
        insttoken=args.insttoken,
        use_cursor=args.use_cursor
    )
    print(f"Entradas obtenidas: {len(entries)}")

    df = normalize_entries(entries)
    export_df = make_export(df)
    out_csv = f"{args.out_prefix}_scopus_export.csv"
    export_df.to_csv(out_csv, index=False, encoding="utf-8-sig")
    print(f"Listo: {out_csv}")

if __name__ == "__main__":
    main()
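The extractor also works as a library rather than through the CLI; a sketch, assuming a valid Elsevier API key (the value below is a placeholder):

from scopus import fetch_scopus_affiliation, normalize_entries, make_export

entries = fetch_scopus_affiliation(api_key="YOUR_ELSEVIER_KEY",  # placeholder key
                                   afid="60077378",
                                   year_start=2023, year_end=2024)
export_df = make_export(normalize_entries(entries))
export_df.to_csv("uptc_afid60077378_scopus_export.csv", index=False, encoding="utf-8-sig")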
scopus_corpus.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:149ba9d029b6969dedcab02f95d6c7c77897fb7470581a3e437d545ea3af2530
size 1443176
scopus_corpus_with_specter.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7442d3f19ba7f2d7685506abf8030951ed1486628b5960ff21eea009f9b533c9
size 4668034