Spaces:

Romanes
/

Revistas

Runtime error

App Files Files Community

Romanes committed on Sep 14

Commit

40fe9ab

verified ·

1 Parent(s): 3ab07d0

Upload 6 files

Browse files

Files changed (6) hide show

build_parquet_embeddings.py +77 -0
build_science_embeddings.py +68 -0
journal_recommender_app.py +548 -0
scopus.py +418 -0
scopus_corpus.parquet +3 -0
scopus_corpus_with_specter.parquet +3 -0

build_parquet_embeddings.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import argparse
+import numpy as np
+import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
+from sentence_transformers import SentenceTransformer
+from tqdm import tqdm
+def ensure_col(df: pd.DataFrame, name: str):
+    if name not in df.columns:
+        df[name] = ""
+    return df
+def main():
+    p = argparse.ArgumentParser(description="Construye un Parquet con embeddings E5 para recomendación de revistas.")
+    p.add_argument("--csv", required=True, help="Ruta al CSV exportado (ej. uptc_afid60077378_scopus_export.csv)")
+    p.add_argument("--out", default="scopus_corpus.parquet", help="Ruta de salida Parquet")
+    p.add_argument("--model", default="intfloat/multilingual-e5-small", help="Modelo Sentence-Transformers")
+    p.add_argument("--batch-size", type=int, default=64, help="Tamaño de batch para el encode")
+    args = p.parse_args()
+    df = pd.read_csv(args.csv)
+    # Asegurar columnas mínimas del export "simple"
+    for c in ["Title","Source title","ISSN","eISSN","Year","Cited by","DOI","Link","EID","Document Type","Open Access"]:
+        ensure_col(df, c)
+    # Texto para similitud: funciona aunque no haya Abstract/Keywords
+    # Usamos título + (revista como contexto suave)
+    df["text_for_match"] = (
+        df["Title"].fillna("").astype(str).str.strip()
+        + ". Revista: "
+        + df["Source title"].fillna("").astype(str).str.strip()
+    ).str.replace(r"\s+", " ", regex=True).str.strip()
+    # Cargar modelo
+    print(f"Cargando modelo: {args.model}")
+    model = SentenceTransformer(args.model, device="cpu")
+    # Prefijo E5: "passage: " para el corpus
+    texts = ["passage: " + t if t else "passage: " for t in df["text_for_match"].tolist()]
+    print(f"Codificando {len(texts)} textos…")
+    embs = model.encode(
+        texts,
+        batch_size=args.batch_size,
+        show_progress_bar=True,
+        normalize_embeddings=True,  # importantes para coseno con producto punto
+    ).astype(np.float32)
+    # Normalizaciones de tipos
+    year = pd.to_numeric(df["Year"], errors="coerce").astype("Int64")
+    cited = pd.to_numeric(df["Cited by"], errors="coerce").fillna(0).astype(np.int32)
+    # Construcción de la tabla Arrow
+    table = pa.table({
+        "eid": pa.array(df["EID"].astype(str).tolist()),
+        "title": pa.array(df["Title"].astype(str).tolist()),
+        "source_title": pa.array(df["Source title"].astype(str).tolist()),
+        "issn": pa.array(df["ISSN"].fillna("").astype(str).tolist()),
+        "eissn": pa.array(df["eISSN"].fillna("").astype(str).tolist()),
+        "year": pa.array(year.tolist(), type=pa.int64()),
+        "cited_by": pa.array(cited.tolist(), type=pa.int32()),
+        "doi": pa.array(df["DOI"].fillna("").astype(str).tolist()),
+        "link": pa.array(df["Link"].fillna("").astype(str).tolist()),
+        "Document Type": pa.array(df["Document Type"].astype(str).tolist()),
+        "Open Access": pa.array(df["Open Access"].astype(str).tolist()),
+        "text_for_match": pa.array(df["text_for_match"].tolist()),
+        "embedding": pa.array(embs.tolist(), type=pa.list_(pa.float32())),
+    })
+    pq.write_table(table, args.out, compression="zstd")
+    dim = len(embs[0]) if len(embs) else 0
+    print(f"OK -> {args.out} | filas: {table.num_rows} | dim: {dim}")
+if __name__ == "__main__":
+    main()

build_science_embeddings.py ADDED Viewed

	@@ -0,0 +1,68 @@

+# build_science_embeddings.py
+import numpy as np
+import pandas as pd
+import pyarrow.parquet as pq
+import pyarrow as pa
+import pyarrow.parquet as pq
+from tqdm import trange
+from sentence_transformers import SentenceTransformer
+# Input corpus and the output file that gets the extra SPECTER column.
+PARQUET_PATH = "scopus_corpus.parquet"
+OUT_PATH = "scopus_corpus_with_specter.parquet"
+# Number of texts passed to the encoder per call.
+BATCH = 64
+DEVICE = "cpu"   # set to "cuda" if a GPU is available
+# 1) Load the corpus into a DataFrame
+table = pq.read_table(PARQUET_PATH)
+df = table.to_pandas()
+# 2) Texto para SPECTER: "Title [SEP] Abstract"
+def row_text(row):
+    title = str(row.get("title", "") or "")
+    abstract = str(row.get("abstract", "") or "")
+    if abstract.strip():
+        return f"{title} [SEP] {abstract}"
+    return title
+# One input text per corpus row, in DataFrame order.
+texts = [row_text(r) for _, r in df.iterrows()]
+# 3) SPECTER model (SentenceTransformers)
+specter = SentenceTransformer("allenai-specter", device=DEVICE)
+# 4) Encode in batches of BATCH texts
+embs = []
+for i in trange(0, len(texts), BATCH, desc="SPECTER"):
+    batch = texts[i:i+BATCH]
+    vecs = specter.encode(batch, normalize_embeddings=True, show_progress_bar=False)
+    embs.append(vecs.astype("float32"))
+# Stack the per-batch arrays into one matrix with a row per text.
+# NOTE(review): np.vstack raises on an empty list, i.e. when the corpus has no rows.
+specter_mat = np.vstack(embs).astype("float32")
+# 5) Store each vector as a list in the DataFrame (same layout as the "embedding" column)
+df["specter_embedding"] = [v.tolist() for v in specter_mat]
+# (Optional) quick SciBERT embeddings (mean pooling)
+# Only if you want them in addition to SPECTER; otherwise leave this block commented out.
+# from transformers import AutoTokenizer, AutoModel
+# import torch
+# tok = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
+# mdl = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(DEVICE)
+# mdl.eval()
+# sciberts = []
+# with torch.no_grad():
+#     for i in trange(0, len(texts), BATCH, desc="SciBERT"):
+#         batch = texts[i:i+BATCH]
+#         enc = tok(batch, padding=True, truncation=True, max_length=256, return_tensors="pt").to(DEVICE)
+#         out = mdl(**enc).last_hidden_state  # [B, T, 768]
+#         mask = enc.attention_mask.unsqueeze(-1)  # [B, T, 1]
+#         summed = (out * mask).sum(dim=1)
+#         counts = mask.sum(dim=1).clamp(min=1)
+#         mean = summed / counts
+#         # L2-normalise
+#         mean = torch.nn.functional.normalize(mean, p=2, dim=1)
+#         sciberts.append(mean.cpu().numpy().astype("float32"))
+# scibert_mat = np.vstack(sciberts).astype("float32")
+# df["scibert_embedding"] = [v.tolist() for v in scibert_mat]
+# 6) Write the enriched Parquet
+pq.write_table(pa.Table.from_pandas(df), OUT_PATH)
+print("OK ->", OUT_PATH)

journal_recommender_app.py ADDED Viewed

	@@ -0,0 +1,548 @@

+import numpy as np
+import pandas as pd
+import pyarrow.parquet as pq
+from sentence_transformers import SentenceTransformer
+import gradio as gr
+import io, os, tempfile, base64, json
+from string import Template
+import networkx as nx
+from networkx.algorithms.community import greedy_modularity_communities
+# =========================
+# Config
+# =========================
+PARQUET_PATH = "scopus_corpus.parquet"  # point this at the enriched parquet if SPECTER embeddings were generated
+MODEL_NAME_E5 = "intfloat/multilingual-e5-small"  # fast retriever
+MODEL_NAME_SPECTER = "allenai-specter"            # scientific-document embeddings
+# Prefix added to every query before encoding it with E5.
+qry_prefix = "query: "
+# =========================
+# Load dataset
+# =========================
+table = pq.read_table(PARQUET_PATH)
+df = table.to_pandas()
+# E5 document embeddings, one row per corpus document
+# (presumably already L2-normalised by the build script — confirm).
+embeddings = np.vstack(df["embedding"].to_list()).astype("float32")
+# SPECTER document embeddings, only when the column is present
+specter_embs = None
+if "specter_embedding" in df.columns:
+    specter_embs = np.vstack(df["specter_embedding"].to_list()).astype("float32")
+SPECTER_AVAILABLE = specter_embs is not None
+# =========================
+# Models (E5 loaded eagerly, SPECTER lazily)
+# =========================
+model_e5 = SentenceTransformer(MODEL_NAME_E5, device="cpu")
+# Cache for the SPECTER model; filled on first use by get_specter().
+_model_specter = None
+def get_specter():
+    global _model_specter
+    if _model_specter is None:
+        _model_specter = SentenceTransformer(MODEL_NAME_SPECTER, device="cpu")
+    return _model_specter
+# =========================
+# Recomendación (tabla)
+# =========================
+def recommend(query: str,
+              k_articles: int = 300,
+              top_n: int = 10,
+              min_year: str = "",
+              max_year: str = "",
+              use_specter: bool = False,
+              alpha_e5: float = 0.6):
+    query = (query or "").strip()
+    if len(query) < 5:
+        return pd.DataFrame({"Mensaje": ["Escribe un título o idea más descriptiva (≥ 5 caracteres)."]})
+    # Filtro de años (opcional)
+    sub_df = df
+    if min_year.strip() or max_year.strip():
+        try:
+            y0 = int(min_year) if min_year.strip() else None
+            y1 = int(max_year) if max_year.strip() else None
+        except ValueError:
+            y0 = y1 = None
+        if y0 is not None:
+            sub_df = sub_df[sub_df["year"].fillna(-1) >= y0]
+        if y1 is not None:
+            sub_df = sub_df[sub_df["year"].fillna(99999) <= y1]
+    if sub_df.empty:
+        return pd.DataFrame({"Mensaje": ["No hay artículos en el rango de años solicitado."]})
+    sub_idx = sub_df.index.to_numpy()
+    sub_e5 = embeddings[sub_idx]
+    # Embedding de la consulta
+    q_e5 = model_e5.encode([qry_prefix + query], normalize_embeddings=True)[0].astype("float32")
+    sims_e5 = sub_e5 @ q_e5
+    sims = sims_e5
+    if use_specter and specter_embs is not None:
+        # Mezcla con SPECTER
+        spc = specter_embs[sub_idx]
+        q_spc = get_specter().encode([query], normalize_embeddings=True)[0].astype("float32")
+        sims_spc = spc @ q_spc
+        alpha = float(alpha_e5)
+        sims = alpha * sims_e5 + (1 - alpha) * sims_spc
+    # Top-k artículos similares
+    k = min(int(k_articles), len(sub_idx))
+    if k <= 0:
+        return pd.DataFrame({"Mensaje": ["No hay artículos para comparar."]})
+    top_k_idx_local = np.argpartition(-sims, k - 1)[:k]
+    top_rows = sub_df.iloc[top_k_idx_local].copy()
+    top_rows["sim"] = sims[top_k_idx_local]
+    # Agregar por revista
+    grp_cols = ["source_title", "issn", "eissn"]
+    best_idx = (top_rows.groupby(grp_cols)["sim"].idxmax())
+    agg = (top_rows.groupby(grp_cols)
+           .agg(score=("sim", "mean"),
+                best=("sim", "max"),
+                n=("sim", "size"))
+           .reset_index())
+    # Extra info (si existe)
+    extra_cols = ["title", "doi", "link", "year", "Document Type", "Open Access"]
+    extra_cols_present = [c for c in extra_cols if c in top_rows.columns]
+    best_titles = top_rows.loc[best_idx, grp_cols + extra_cols_present].set_index(grp_cols)
+    agg = agg.merge(best_titles, left_on=grp_cols, right_index=True, how="left")
+    # Ranking híbrido
+    agg["rank"] = agg["score"] * 0.8 + agg["best"] * 0.2 + np.log1p(agg["n"]) * 0.02
+    out = (
+        agg.sort_values("rank", ascending=False)
+           .head(int(top_n))
+           .rename(columns={
+               "source_title": "Revista",
+               "issn": "ISSN",
+               "eissn": "eISSN",
+               "n": "#similitudes",
+               "year": "Año",
+               "score": "Score medio",
+               "best": "Mejor similitud",
+               "title": "Título del artículo",
+               "doi": "DOI",
+               "link": "Link",
+               "document type": "Document Type",
+               "open access": "Open Access"
+           })
+    )
+    if "Año" in out.columns:
+        out["Año"] = out["Año"].fillna(0).astype(int).replace(0, "")
+    cols = ["Revista","Año","ISSN","eISSN","#similitudes","Score medio","Mejor similitud",
+            "Título del artículo","DOI","Link","Document Type","Open Access"]
+    out = out[[c for c in cols if c in out.columns]]
+    if "Score medio" in out.columns:
+        out["Score medio"] = out["Score medio"].round(3)
+    if "Mejor similitud" in out.columns:
+        out["Mejor similitud"] = out["Mejor similitud"].round(3)
+    return out
+# =========================
+# Grafo interactivo (vis-network en iframe)
+# =========================
+def build_similarity_network_html(query_text: str,
+                                  k_articles: int,
+                                  min_year: str,
+                                  max_year: str,
+                                  use_specter: bool = False,
+                                  alpha_e5: float = 0.6,
+                                  top_nodes: int = 15,
+                                  doc_edge_threshold: float = 0.35) -> str:
+    qtxt = (query_text or "").strip()
+    if len(qtxt) < 5:
+        return "<p>Escribe un título/idea más descriptiva (≥ 5 caracteres).</p>"
+    # ---- Filtro por años ----
+    sub_df = df
+    if (min_year or "").strip() or (max_year or "").strip():
+        try:
+            y0 = int(min_year) if (min_year or "").strip() else None
+            y1 = int(max_year) if (max_year or "").strip() else None
+        except ValueError:
+            y0 = y1 = None
+        if y0 is not None:
+            sub_df = sub_df[sub_df["year"].fillna(-1) >= y0]
+        if y1 is not None:
+            sub_df = sub_df[sub_df["year"].fillna(99999) <= y1]
+        if sub_df.empty:
+            return "<p>No hay artículos en el rango de años solicitado.</p>"
+    sub_idx = sub_df.index.to_numpy()
+    sub_e5 = embeddings[sub_idx]
+    # ---- Similitud a consulta (para tamaño de nodos) ----
+    q_e5 = model_e5.encode([qry_prefix + qtxt], normalize_embeddings=True)[0].astype("float32")
+    scores_e5 = sub_e5 @ q_e5
+    # Híbrido (opcional)
+    ns = scores_e5
+    if use_specter and specter_embs is not None:
+        spc = specter_embs[sub_idx]
+        q_spc = get_specter().encode([qtxt], normalize_embeddings=True)[0].astype("float32")
+        scores_spc = spc @ q_spc
+        alpha = float(alpha_e5)
+        ns = alpha * scores_e5 + (1 - alpha) * scores_spc
+    # Top-k por similitud
+    k = min(int(k_articles), len(sub_idx))
+    top_idx_local = np.argpartition(-ns, k - 1)[:k]
+    top_rows = sub_df.iloc[top_idx_local].copy()
+    top_rows["sim_to_query"] = ns[top_idx_local]
+    top_rows = top_rows.sort_values("sim_to_query", ascending=False).head(int(top_nodes))
+    if len(top_rows) < 2:
+        return "<p>No hay suficientes artículos para graficar la red.</p>"
+    node_idx = top_rows.index.to_numpy()
+    node_e5 = embeddings[node_idx]
+    # ---- Aristas artículo–artículo ----
+    # E5 por defecto; si SPECTER activo y disponible, usarlo para mayor coherencia temática
+    pair_mat = node_e5
+    if use_specter and specter_embs is not None:
+        pair_mat = specter_embs[node_idx]
+    pair_sims = pair_mat @ pair_mat.T
+    # ---- Colores por año (teal gradient estilo CP) ----
+    years = top_rows["year"].fillna(0).astype(int).to_numpy()
+    y_valid = years[years > 0]
+    y_min, y_max = (int(y_valid.min()), int(y_valid.max())) if len(y_valid) else (2000, 2025)
+    def teal_year_color(y: int) -> str:
+        t = 0.0 if (not y or y <= 0 or y_max == y_min) else (y - y_min) / (y_max - y_min)
+        h = 170
+        s = int(35 + 35 * t)
+        l = int(85 - 30 * t)
+        return f"hsl({h}, {s}%, {l}%)"
+    # ---- Comunidades (clusters) para modo color=Comunidad ----
+    ids = [str(row.get("eid", idx)) for idx, row in top_rows.iterrows()]
+    Gc = nx.Graph()
+    Gc.add_nodes_from(ids)
+    n = len(ids)
+    for i in range(n):
+        for j in range(i + 1, n):
+            w = float(pair_sims[i, j])
+            if w >= float(doc_edge_threshold):
+                Gc.add_edge(ids[i], ids[j], weight=w)
+    comms = list(greedy_modularity_communities(Gc, weight="weight")) if Gc.number_of_edges() else [set(ids)]
+    node2comm = {nid: ci for ci, c in enumerate(comms) for nid in c}
+    def pastel_palette(k, s=60, l=65):
+        return [f"hsl({int(360*i/k)}, {s}%, {l}%)" for i in range(max(1, k))]
+    comm_colors = pastel_palette(len(comms))
+    group_colors = {str(i): comm_colors[i] for i in range(len(comms))}
+    # ---- Construcción nodos/aristas para vis.js ----
+    ns_nodes = top_rows["sim_to_query"].to_numpy(dtype=float)
+    smin, smax = (float(ns_nodes.min()), float(ns_nodes.max())) if ns_nodes.size else (0.0, 0.0)
+    def node_size(sim):
+        if smax <= smin: return 18
+        return 14 + 40 * (float(sim) - smin) / (smax - smin)
+    nodes, edges = [], []
+    nodes.append({
+        "id": "QUERY", "label": "Consulta", "title": qtxt,
+        "shape": "star", "size": 46, "color": "#e45756",
+        "font": {"size": 16, "strokeWidth": 6, "strokeColor": "#ffffff"}
+    })
+    for _, row in top_rows.iterrows():
+        eid = str(row.get("eid", "")) or str(row.name)
+        title = str(row.get("title", ""))[:160]
+        journal = str(row.get("source_title", ""))[:120]
+        year = int(row.get("year", 0)) if pd.notna(row.get("year", None)) else 0
+        doi  = str(row.get("doi", "")) or ""
+        link = str(row.get("link", "")) or ""
+        sim  = float(row.get("sim_to_query", 0.0))
+        label = (journal or title)[:40] or "Artículo"
+        tooltip = (
+            f"<b>{title}</b><br>"
+            f"Revista: {journal}<br>"
+            f"Año: {year if year>0 else 'N/D'}<br>"
+            f"Similitud con consulta: {sim:.3f}<br>"
+            f"DOI: {doi}<br>"
+            f"<a href='{link}' target='_blank'>Abrir</a>"
+        )
+        group = str(node2comm.get(eid, 0))
+        nodes.append({
+            "id": eid, "label": label, "title": tooltip,
+            "size": node_size(sim), "year": year, "group": group,
+            "colorYear": teal_year_color(year),
+            "font": {"size": 14, "strokeWidth": 6, "strokeColor": "#ffffff"}
+        })
+        edges.append({
+            "from": "QUERY", "to": eid,
+            "value": sim,
+            "width": 1 + 6*max(0.0, sim),
+            "color": {"color": "#9fb7b3"},
+            "smooth": True
+        })
+    for i in range(n):
+        for j in range(i + 1, n):
+            w = float(pair_sims[i, j])
+            edges.append({
+                "from": ids[i], "to": ids[j],
+                "value": w,
+                "width": max(0.8, 3.0*(w-0.2)),
+                "hidden": w < doc_edge_threshold,
+                "color": {"color": "#b9c7c5"},
+                "smooth": True
+            })
+    options = {
+        "interaction": {
+            "hover": True, "multiselect": True, "dragNodes": True,
+            "navigationButtons": False,
+            "keyboard": {"enabled": True, "bindToWindow": True}
+        },
+        "physics": {
+            "enabled": True, "solver": "forceAtlas2Based",
+            "forceAtlas2Based": {
+                "avoidOverlap": 0.4, "gravitationalConstant": -45,
+                "centralGravity": 0.015, "springLength": 135,
+                "springConstant": 0.055, "damping": 0.45
+            },
+            "stabilization": {"iterations": 140}
+        },
+        "nodes": {
+            "shape": "dot", "borderWidth": 1,
+            "shadow": {"enabled": True, "size": 8, "x": 0, "y": 1}
+        },
+        "edges": {
+            "smooth": {"type": "continuous"},
+            "selectionWidth": 2,
+            "shadow": {"enabled": True, "size": 6, "x": 0, "y": 1}
+        }
+    }
+    tmpl = Template(r"""
+<div style="font-family:system-ui,-apple-system,Segoe UI,Roboto; background:#f6f8f9; padding:8px; border-radius:8px;">
+  <div style="display:flex; gap:14px; align-items:center; margin:6px 0 10px 0;">
+    <div style="white-space:nowrap;">
+      <label><b>Color por:</b></label>
+      <label style="margin-left:6px;"><input type="radio" name="colorMode" value="year" checked> Año</label>
+      <label style="margin-left:6px;"><input type="radio" name="colorMode" value="community"> Comunidad</label>
+    </div>
+    <div style="min-width:290px;">
+      <label for="edgeSlider"><b>Umbral</b>: <span id="edgeVal">$THRESH</span></label>
+      <input id="edgeSlider" type="range" min="0" max="1" step="0.01" value="$THRESH"
+             style="width:180px; margin-left:8px;">
+    </div>
+  </div>
+  <div style="display:flex; align-items:center; gap:10px; margin:2px 0 8px 6px;">
+    <div style="width:82px; text-align:right; color:#5b6b70; font-size:12px;">Años:</div>
+    <input id="yearMin" type="range" min="$YMIN" max="$YMAX" value="$YMIN" step="1" style="flex:1;">
+    <input id="yearMax" type="range" min="$YMIN" max="$YMAX" value="$YMAX" step="1" style="flex:1;">
+    <div id="yearLbl" style="width:130px; text-align:left; color:#5b6b70; font-size:12px;">$YMIN – $YMAX</div>
+  </div>
+  <div style="height:10px; margin:0 6px 8px 90px; background:linear-gradient(90deg, hsl(170,35%,85%) 0%, hsl(170,70%,55%) 100%); border-radius:6px;"></div>
+  <div id="netContainer" style="height:720px; border:1px solid #d6e0e2; border-radius:12px; background:#fbfcfd;"></div>
+  <div style="position:relative; margin-top:6px;">
+    <div style="position:absolute; left:6px; bottom:6px; display:flex; gap:8px;">
+      <button id="btnFit" title="Ajustar vista" style="border:0; background:#e7f0ef; padding:6px 10px; border-radius:10px;">⟲</button>
+      <button id="btnPNG" title="Exportar PNG" style="border:0; background:#e7f0ef; padding:6px 10px; border-radius:10px;">⬇</button>
+      <button id="btnHelp" title="Ayuda" style="border:0; background:#e7f0ef; padding:6px 10px; border-radius:10px;">?</button>
+    </div>
+  </div>
+</div>
+<script src="https://unpkg.com/vis-network@9.1.9/dist/vis-network.min.js"></script>
+<script>
+(function(){
+  const nodes = new vis.DataSet($NODES);
+  const edges = new vis.DataSet($EDGES);
+  const options = $OPTIONS;
+  const groupColors = $GROUPCOLORS;
+  const container = document.getElementById('netContainer');
+  const net = new vis.Network(container, {nodes, edges}, options);
+  window.network = net; window.nodes = nodes; window.edges = edges;
+  // Color por año/comunidad
+  function applyColors(mode){
+    nodes.forEach(n=>{
+      if(n.id==='QUERY') return;
+      const col = (mode==='community') ? (groupColors[String(n.group)]||'#9fb7b3') : (n.colorYear||'#9fb7b3');
+      nodes.update({ id:n.id, color: col });
+    });
+  }
+  applyColors('year');
+  document.querySelectorAll('input[name="colorMode"]').forEach(r =>
+    r.addEventListener('change', e => applyColors(e.target.value))
+  );
+  // Umbral
+  const slider = document.getElementById('edgeSlider');
+  const edgeVal = document.getElementById('edgeVal');
+  function applyThreshold(th){
+    edges.forEach(e=>{
+      const show = (e.value||0) >= th || e.from==='QUERY' || e.to==='QUERY';
+      edges.update({ id:e.id, hidden:!show, width: show ? (e.width||1) : 0.1 });
+    });
+  }
+  slider.addEventListener('input', ()=>{
+    const th = parseFloat(slider.value||'0');
+    edgeVal.textContent = th.toFixed(2);
+    applyThreshold(th);
+  });
+  applyThreshold(parseFloat(slider.value||'0'));
+  // Timeline (doble slider)
+  const sMin=document.getElementById('yearMin'), sMax=document.getElementById('yearMax');
+  const yLbl=document.getElementById('yearLbl');
+  function applyYearFilter(a,b){
+    const lo=Math.min(a,b), hi=Math.max(a,b);
+    yLbl.textContent = lo+" – "+hi;
+    const visible=new Set();
+    nodes.forEach(n=>{
+      if(n.id==='QUERY'){ visible.add(n.id); return; }
+      const y=Number(n.year||0);
+      const show=(y===0)||(y>=lo && y<=hi);
+      nodes.update({ id:n.id, hidden:!show });
+      if(show) visible.add(n.id);
+    });
+    edges.forEach(e=>{
+      const show=visible.has(e.from)&&visible.has(e.to);
+      edges.update({ id:e.id, hidden:!show });
+    });
+  }
+  function clamp(){ let a=+sMin.value, b=+sMax.value; if(a>b) [a,b]=[b,a]; applyYearFilter(a,b); }
+  sMin.addEventListener('input',clamp); sMax.addEventListener('input',clamp); clamp();
+  // Resaltado de vecindad
+  const inactive='rgba(200,210,210,0.35)';
+  function highlight(ids){
+    const nbr=new Set(ids);
+    ids.forEach(id=> net.getConnectedNodes(id).forEach(n=>nbr.add(n)));
+    nodes.forEach(n=>{
+      const active=nbr.has(n.id)||n.id==='QUERY';
+      nodes.update({ id:n.id, color: active?(n.color||'#9fb7b3'):inactive });
+    });
+  }
+  net.on('selectNode', p=>highlight(p.nodes));
+  net.on('deselectNode', ()=>applyColors(document.querySelector('input[name="colorMode"]:checked').value));
+  // Botones
+  document.getElementById('btnFit').onclick = () => net.fit({animation: true});
+  document.getElementById('btnPNG').onclick = () => {
+    const url = net.canvas.frame.canvas.toDataURL('image/png');
+    const a = document.createElement('a'); a.href = url; a.download = 'graph.png'; a.click();
+  };
+  document.getElementById('btnHelp').onclick = () => alert(
+    "Usa: Color por Año/Comunidad • Umbral de arista • Rango de años • Clic para resaltar vecindad • Doble clic abre el enlace (tooltip)."
+  );
+  // Doble clic abre enlace/DOI si existe
+  net.on('doubleClick', (p) => {
+    if (p.nodes && p.nodes.length===1){
+      const n = nodes.get(p.nodes[0]);
+      if (n && n.title) {
+        const tmp = document.createElement('div'); tmp.innerHTML = n.title;
+        const a = tmp.querySelector('a'); if (a && a.href) window.open(a.href, '_blank');
+      }
+    }
+  });
+})();
+</script>
+""")
+    html = tmpl.substitute(
+        NODES=json.dumps(nodes),
+        EDGES=json.dumps(edges),
+        OPTIONS=json.dumps(options),
+        GROUPCOLORS=json.dumps(group_colors),
+        YMIN=y_min,
+        YMAX=y_max,
+        THRESH=f"{doc_edge_threshold:.2f}",
+    )
+    b64 = base64.b64encode(html.encode("utf-8")).decode("ascii")
+    return (
+        f'<iframe src="data:text/html;charset=utf-8;base64,{b64}" '
+        f'style="width:100%;height:820px;border:0;" '
+        f'sandbox="allow-scripts allow-same-origin allow-popups"></iframe>'
+    )
+# =========================
+# Gradio UI
+# =========================
+# Layout, top to bottom: query box, year filters, top-k / journal-count
+# sliders, SPECTER fusion controls, action buttons, then the two outputs.
+with gr.Blocks(title="Recomendador de Revistas (Scopus)") as demo:
+    gr.Markdown("## Recomendación de revistas UPTC")
+    # --- Main input ---
+    with gr.Row():
+        query = gr.Textbox(
+            label="Título o idea de investigación",
+            lines=3,
+            placeholder="Ej.: Detección temprana de fallas en motores usando aprendizaje profundo…"
+        )
+    # --- Year filters (free text; parsed by the callbacks) ---
+    with gr.Row():
+        min_year = gr.Textbox(label="Año mínimo (opcional)", placeholder="2019")
+        max_year = gr.Textbox(label="Año máximo (opcional)", placeholder="2025")
+    # --- Top-k and number of journals ---
+    with gr.Row():
+        k_articles = gr.Slider(50, 1000, value=300, step=50, label="Artículos considerados (top-k)")
+        top_n = gr.Slider(5, 20, value=10, step=1, label="Nº de revistas a mostrar")
+    # --- Fusion with SPECTER (checked by default only when the embeddings were loaded) ---
+    with gr.Row():
+        use_specter = gr.Checkbox(
+            label="Fusionar con SPECTER (mejor afinidad científica)",
+            value=SPECTER_AVAILABLE
+        )
+        alpha_e5 = gr.Slider(0.0, 1.0, value=0.6, step=0.05, label="Peso E5  (1−α = SPECTER)")
+    # --- Buttons: always right below the fusion controls ---
+    with gr.Row():
+        btn = gr.Button("Recomendar")
+        btn_net = gr.Button("Ver red de similitud")
+    # --- Outputs (declared afterwards so the buttons stay above them) ---
+    out = gr.Dataframe(
+        row_count=10, wrap=True,
+        column_widths=[180, 60, 90, 90, 90, 90, 90, 250, 120, 120, 120, 100],
+        label="Revistas recomendadas"
+    )
+    out_net_html = gr.HTML(label="Grafo interactivo (explorable)")
+    # --- Actions (wired after 'out' and 'out_net_html' exist) ---
+    # Both the button and pressing Enter in the query box run recommend().
+    btn.click(
+        fn=recommend,
+        inputs=[query, k_articles, top_n, min_year, max_year, use_specter, alpha_e5],
+        outputs=out
+    )
+    query.submit(
+        fn=recommend,
+        inputs=[query, k_articles, top_n, min_year, max_year, use_specter, alpha_e5],
+        outputs=out
+    )
+    # The graph always uses 15 nodes and a 0.35 initial edge threshold.
+    btn_net.click(
+        fn=lambda q, ka, y0, y1, us, a: build_similarity_network_html(
+            q, ka, y0, y1, use_specter=us, alpha_e5=a, top_nodes=15, doc_edge_threshold=0.35
+        ),
+        inputs=[query, k_articles, min_year, max_year, use_specter, alpha_e5],
+        outputs=[out_net_html]
+    )
+if __name__ == "__main__":
+    demo.launch()

scopus.py ADDED Viewed

	@@ -0,0 +1,418 @@

+# scopus_simple_extract.py
+# Extrae resultados Scopus por AF-ID y exporta UN CSV "amigable" con campos básicos.
+# NO pide Abstract, Autores, Keywords, Funding, Conference, etc.
+import time
+import argparse
+import urllib.parse as urlparse
+from typing import Dict, List, Optional
+import requests
+import numpy as np
+import pandas as pd
+BASE_URL_SEARCH = "https://api.elsevier.com/content/search/scopus"
+# -------------------------
+# HTTP utilidades
+# -------------------------
+def build_headers(api_key: str, insttoken: Optional[str] = None) -> Dict[str, str]:
+    h = {"Accept": "application/json", "X-ELS-APIKey": api_key.strip()}
+    if insttoken:
+        h["X-ELS-Insttoken"] = insttoken.strip()
+    return h
+def get_json(session: requests.Session, url: str, params: Dict[str, str],
+             headers: Dict[str, str], max_retries: int = 6, sleep_base: float = 0.75) -> Dict:
+    """
+    GET con reintentos para 429/5xx. Si 401 por Insttoken mal pareado, reintenta SIN Insttoken.
+    """
+    last_exc = None
+    tried_without_token = False
+    for t in range(max_retries + 1):
+        try:
+            r = session.get(url, params=params, headers=headers, timeout=90)
+        except Exception as ex:
+            last_exc = ex
+            time.sleep((2 ** t) * sleep_base)
+            continue
+        if r.status_code in (429, 500, 502, 503, 504):
+            time.sleep((2 ** t) * sleep_base)
+            continue
+        if r.status_code == 401:
+            # intentar una sola vez sin Insttoken si el problema es token no asociado
+            try:
+                j = r.json()
+            except Exception:
+                j = {}
+            if ("Institution Token is not associated with API Key" in str(j)
+                and not tried_without_token
+                and "X-ELS-Insttoken" in headers):
+                tried_without_token = True
+                h2 = dict(headers)
+                h2.pop("X-ELS-Insttoken", None)
+                r2 = session.get(url, params=params, headers=h2, timeout=90)
+                if r2.ok:
+                    try:
+                        return r2.json()
+                    except Exception:
+                        raise RuntimeError("La respuesta no es JSON decodificable.")
+                else:
+                    try:
+                        j2 = r2.json()
+                    except Exception:
+                        j2 = {}
+                    raise RuntimeError(f"HTTP {r2.status_code} – {j2 or r2.text}")
+        if not r.ok:
+            try:
+                j = r.json()
+            except Exception:
+                j = {}
+            raise RuntimeError(f"HTTP {r.status_code} – {j or r.text}")
+        try:
+            return r.json()
+        except Exception:
+            raise RuntimeError("La respuesta no es JSON decodificable.")
+    if last_exc:
+        raise RuntimeError(f"Error de red persistente: {last_exc}")
+    raise RuntimeError("No se obtuvo respuesta estable tras varios reintentos.")
+# -------------------------
+# Paginación Search API
+# -------------------------
def extract_by_year_cursor(session: requests.Session, headers: Dict[str, str],
                           afid: str, year: int, page_size: int, view: str) -> List[Dict]:
    """Fetch all Search API entries of one affiliation for one year via cursor pagination.

    Args:
        session: HTTP session handed to ``get_json``.
        headers: Request headers handed to ``get_json``.
        afid: Scopus affiliation id placed in the ``AF-ID(...)`` query clause.
        year: Publication year placed in the ``PUBYEAR = ...`` query clause.
        page_size: Value sent as the ``count`` parameter.
        view: Value sent as the ``view`` parameter.

    Returns:
        The concatenated ``search-results.entry`` items of every page, without
        the placeholder "error" entries the API uses for empty result sets.

    Raises:
        RuntimeError: Propagated from ``get_json``.
    """
    params = {
        "query": f"AF-ID({afid}) AND PUBYEAR = {year}",
        "view": view,
        "count": str(page_size),
        "cursor": "*",
    }
    entries: List[Dict] = []
    while True:
        payload = get_json(session, BASE_URL_SEARCH, params, headers)
        results = payload.get("search-results") or {}
        # The Scopus Search API reports "no results" as a single placeholder
        # entry that only carries an "error" message; it is not a document.
        page = [
            item for item in (results.get("entry") or [])
            if not (isinstance(item, dict) and "error" in item)
        ]
        if not page:
            # An empty page means the result set is exhausted; stopping here
            # also prevents looping forever if a "next" link keeps appearing.
            break
        entries.extend(page)

        next_token = None
        for link in results.get("link") or []:
            if link.get("@ref") == "next":
                href = link.get("@href")
                if href:
                    query = urlparse.urlparse(href).query
                    next_token = (urlparse.parse_qs(query).get("cursor") or [None])[0]
                break
        # A missing or non-advancing cursor would repeat the same request.
        if not next_token or next_token == params["cursor"]:
            break
        params["cursor"] = next_token
    return entries
def extract_by_year_startcount(session: requests.Session, headers: Dict[str, str],
                               afid: str, year: int, page_size: int, view: str,
                               hard_limit: int = 20000) -> List[Dict]:
    """Fetch all Search API entries of one affiliation for one year via start/count paging.

    Args:
        session: HTTP session handed to ``get_json``.
        headers: Request headers handed to ``get_json``.
        afid: Scopus affiliation id placed in the ``AF-ID(...)`` query clause.
        year: Publication year placed in the ``PUBYEAR = ...`` query clause.
        page_size: Value sent as ``count``; also the offset increment.
        view: Value sent as the ``view`` parameter.
        hard_limit: Paging stops once the ``start`` offset reaches this value.

    Returns:
        The concatenated ``search-results.entry`` items of every page, without
        the placeholder "error" entries the API uses for empty result sets.

    Raises:
        RuntimeError: Propagated from ``get_json``.
    """
    entries: List[Dict] = []
    start = 0
    while start < hard_limit:
        params = {
            "query": f"AF-ID({afid}) AND PUBYEAR = {year}",
            "view": view,
            "count": str(page_size),
            "start": str(start),
        }
        payload = get_json(session, BASE_URL_SEARCH, params, headers)
        results = payload.get("search-results") or {}
        raw_page = results.get("entry") or []
        # The Scopus Search API reports "no results" as a single placeholder
        # entry that only carries an "error" message; it is not a document.
        page = [
            item for item in raw_page
            if not (isinstance(item, dict) and "error" in item)
        ]
        entries.extend(page)
        # No real entries, or a short page, means this was the last page.
        if not page or len(raw_page) < page_size:
            break
        start += page_size
    return entries
def extract_no_year(session: requests.Session, headers: Dict[str, str],
                    afid: str, page_size: int, view: str, use_cursor: bool) -> List[Dict]:
    """Fetch all Search API entries of one affiliation without a year filter.

    Args:
        session: HTTP session handed to ``get_json``.
        headers: Request headers handed to ``get_json``.
        afid: Scopus affiliation id placed in the ``AF-ID(...)`` query.
        page_size: Value sent as ``count`` (and the offset step without cursor).
        view: Value sent as the ``view`` parameter.
        use_cursor: ``True`` for cursor pagination, ``False`` for start/count.

    Returns:
        The concatenated ``search-results.entry`` items of every page, without
        the placeholder "error" entries the API uses for empty result sets.

    Raises:
        RuntimeError: Propagated from ``get_json``.
    """
    query = f"AF-ID({afid})"
    entries: List[Dict] = []

    if use_cursor:
        params = {"query": query, "view": view, "count": str(page_size), "cursor": "*"}
        while True:
            payload = get_json(session, BASE_URL_SEARCH, params, headers)
            results = payload.get("search-results") or {}
            # "No results" arrives as one placeholder entry holding only an
            # "error" message; it must not be collected as a document.
            page = [
                item for item in (results.get("entry") or [])
                if not (isinstance(item, dict) and "error" in item)
            ]
            if not page:
                # Exhausted result set; also guards against endless "next" links.
                break
            entries.extend(page)

            next_token = None
            for link in results.get("link") or []:
                if link.get("@ref") == "next":
                    href = link.get("@href")
                    if href:
                        link_query = urlparse.urlparse(href).query
                        next_token = (urlparse.parse_qs(link_query).get("cursor") or [None])[0]
                    break
            # A missing or non-advancing cursor would repeat the same request.
            if not next_token or next_token == params["cursor"]:
                break
            params["cursor"] = next_token
        return entries

    start = 0
    while True:
        params_sc = {"query": query, "view": view, "count": str(page_size), "start": str(start)}
        payload = get_json(session, BASE_URL_SEARCH, params_sc, headers)
        results = payload.get("search-results") or {}
        raw_page = results.get("entry") or []
        page = [
            item for item in raw_page
            if not (isinstance(item, dict) and "error" in item)
        ]
        entries.extend(page)
        # No real entries, or a short page, means this was the last page.
        if not page or len(raw_page) < page_size:
            break
        start += page_size
    return entries
+# -------------------------
+# Normalización básica (sin autores/abstract/keywords/funding/etc.)
+# -------------------------
# Maps the Search API entry keys that are kept (left) to the flat column
# names used in the normalized DataFrame (right). Abstract and keywords are
# deliberately not requested.
TOP_FIELD_MAP: Dict[str, str] = {
    # Bibliographic basics
    "dc:title": "title",
    "prism:coverDate": "coverDate",
    "prism:doi": "doi",
    # Source (journal) data
    "prism:publicationName": "sourceTitle",
    "prism:issn": "issn",
    "prism:eIssn": "eIssn",
    "prism:volume": "volume",
    "prism:issueIdentifier": "issue",
    "prism:pageRange": "pages",
    # Metrics and classification
    "citedby-count": "citedBy",
    "subtype": "subtype",
    "subtypeDescription": "subtypeDesc",
    "openaccessFlag": "openAccess",
    # Identifiers and links
    "dc:identifier": "identifier",
    "eid": "eid",
    "prism:url": "prismUrl",
}
def links_to_dict(links: List[Dict]) -> Dict[str, str]:
    """Flatten an entry's link list into ``{"link_<ref>": href}`` pairs.

    Items lacking a truthy ``@ref`` or ``@href`` are skipped; ``None`` or an
    empty list gives an empty dict. When a ref repeats, the last href wins.
    """
    ref_href_pairs = ((item.get("@ref"), item.get("@href")) for item in links or [])
    return {f"link_{ref}": href for ref, href in ref_href_pairs if ref and href}
def normalize_entries(entries: List[Dict]) -> pd.DataFrame:
    """Turn raw Search API entries into a flat, de-duplicated DataFrame.

    Each entry contributes the fields listed in ``TOP_FIELD_MAP`` (renamed to
    their target column names) plus one ``link_<ref>`` column per link.

    Args:
        entries: Raw ``search-results.entry`` items; ``None`` is treated as empty.

    Returns:
        One row per entry. ``coverDate`` is parsed to datetime (unparseable
        values become ``NaT``) and duplicates are dropped on whichever of
        ``eid`` / ``identifier`` are present, keeping the first occurrence.
    """
    rows: List[Dict] = []
    for entry in entries or []:
        # The Scopus Search API signals an empty result set with a placeholder
        # entry that only carries an "error" key; it would otherwise become an
        # all-empty row in the export.
        if "error" in entry:
            continue
        row = {
            target: entry.get(source)
            for source, target in TOP_FIELD_MAP.items()
            if source in entry
        }
        row.update(links_to_dict(entry.get("link")))
        rows.append(row)

    df = pd.DataFrame(rows)
    if df.empty:
        return df

    if "coverDate" in df.columns:
        df["coverDate"] = pd.to_datetime(df["coverDate"], errors="coerce")
    key_columns = [name for name in ("eid", "identifier") if name in df.columns]
    if key_columns:
        df = df.drop_duplicates(subset=key_columns, keep="first")
    return df
+# -------------------------
+# Fallbacks de vista/paginación
+# -------------------------
def try_extract_year(session, headers, afid, year, page_size, view, use_cursor) -> List[Dict]:
    """Extract one year, retrying once with a degraded configuration on failure.

    On ``RuntimeError`` from the first attempt, exactly one retry is made:
    * message contains ``AUTHORIZATION_ERROR``: next lower view
      (COMPLETE -> STANDARD, STANDARD -> BASIC); re-raised when none exists;
    * message reports the service-level maximum: page size 25, no cursor;
    * otherwise, if a cursor was in use: same request with start/count.
    Any other case re-raises the original error.
    """
    def run(size, with_cursor, chosen_view):
        extractor = extract_by_year_cursor if with_cursor else extract_by_year_startcount
        return extractor(session, headers, afid, year, size, chosen_view)

    try:
        return run(page_size, use_cursor, view)
    except RuntimeError as err:
        text = str(err)
        if "AUTHORIZATION_ERROR" in text:
            lower_view = {"COMPLETE": "STANDARD", "STANDARD": "BASIC"}.get(view)
            if lower_view is None:
                raise
            return run(page_size, use_cursor, lower_view)
        over_quota = (
            "INVALID_INPUT" in text
            and "maximum number allowed for the service level" in text
        )
        if over_quota:
            # Shrink the page and drop the cursor.
            return run(25, False, view)
        if not use_cursor:
            raise
        return run(page_size, False, view)
def try_extract_no_year(session, headers, afid, page_size, view, use_cursor) -> List[Dict]:
    """Extract without a year filter, retrying once with a degraded configuration.

    On ``RuntimeError`` from the first attempt, exactly one retry is made:
    * message contains ``AUTHORIZATION_ERROR``: next lower view
      (COMPLETE -> STANDARD, STANDARD -> BASIC); re-raised when none exists;
    * message reports the service-level maximum: page size 25, no cursor;
    * otherwise, if a cursor was in use: same request with start/count.
    Any other case re-raises the original error.
    """
    try:
        return extract_no_year(session, headers, afid, page_size, view, use_cursor)
    except RuntimeError as err:
        text = str(err)
        if "AUTHORIZATION_ERROR" in text:
            # Step down to STANDARD / BASIC when a lower view exists.
            lower_view = {"COMPLETE": "STANDARD", "STANDARD": "BASIC"}.get(view)
            if lower_view is None:
                raise
            return extract_no_year(session, headers, afid, page_size, lower_view, use_cursor)
        over_quota = (
            "INVALID_INPUT" in text
            and "maximum number allowed for the service level" in text
        )
        if over_quota:
            return extract_no_year(session, headers, afid, 25, view, False)
        if not use_cursor:
            raise
        return extract_no_year(session, headers, afid, page_size, view, False)
def fetch_scopus_affiliation(api_key: str,
                             afid: str = "60077378",
                             year_start: Optional[int] = 2020,
                             year_end: Optional[int] = 2024,
                             view: str = "STANDARD",
                             page_size: int = 100,
                             insttoken: Optional[str] = None,
                             use_cursor: bool = True) -> List[Dict]:
    """Download the raw Search API entries of one Scopus affiliation.

    Args:
        api_key: API key passed to ``build_headers``.
        afid: Scopus affiliation id to query.
        year_start: First publication year (inclusive), or ``None``.
        year_end: Last publication year (inclusive), or ``None``.
        view: Search API view requested first.
        page_size: Number of entries requested per page.
        insttoken: Optional institution token passed to ``build_headers``.
        use_cursor: Prefer cursor pagination over start/count.

    Returns:
        All raw entries. When either year bound is ``None`` a single query
        without year filter is run; otherwise one query per year in
        ``[year_start, year_end]`` is run and the results are concatenated.

    Raises:
        RuntimeError: Propagated from the extraction helpers.
    """
    headers = build_headers(api_key, insttoken)
    # The context manager closes the session's pooled connections even when
    # an extraction fails part-way through.
    with requests.Session() as session:
        if year_start is None or year_end is None:
            return try_extract_no_year(session, headers, afid, page_size, view, use_cursor)
        entries: List[Dict] = []
        for yr in range(int(year_start), int(year_end) + 1):
            entries.extend(
                try_extract_year(session, headers, afid, yr, page_size, view, use_cursor)
            )
        return entries
+# -------------------------
+# Export UN SOLO CSV (ligero)
+# -------------------------
# Columns of the exported CSV, in output order.
EXPORT_COLUMNS: List[str] = [
    "Title",
    "Year",
    "Source title",
    "Volume",
    "Issue",
    "Page start",
    "Page end",
    "Page count",
    "Cited by",
    "DOI",
    "Link",
    "ISSN",
    "eISSN",
    "Document Type",
    "Open Access",
    "EID",
]
+def _pick_link(row: pd.Series) -> str:
+    for c in ("prismUrl","link_scopus","prism:url","link_self"):
+        if c in row and pd.notna(row[c]) and str(row[c]).strip():
+            return str(row[c])
+    return ""
def pick_col(df: pd.DataFrame, primary: str, secondary: str, default: str = "") -> pd.Series:
    """Row-wise fallback between two columns.

    Args:
        df: Source frame; it is not modified.
        primary: Preferred column name.
        secondary: Column used wherever ``primary`` is NA or blank.
        default: Value used for a missing ``secondary`` column and for any
            cell that is still NA after the fallback.

    Returns:
        A Series aligned to ``df.index`` holding the primary value where it is
        non-blank, else the secondary value, else ``default``.
    """
    if primary in df.columns:
        first = df[primary]
    else:
        first = pd.Series(np.nan, index=df.index, dtype=object)
    if secondary in df.columns:
        second = df[secondary]
    else:
        second = pd.Series(default, index=df.index, dtype=object)

    blank = first.isna() | (first.astype(str).str.strip() == "")
    # Work on an object-dtype copy and use ``where`` instead of assigning
    # through ``.loc``: writing string fallbacks into a float (all-NaN) series
    # is a deprecated incompatible-dtype set in recent pandas.
    merged = first.astype(object).where(~blank, second)
    return merged.fillna(default)
def _split_page_range(value):
    """Split a ``"start-end"`` page range into ``(start, end, count)`` strings.

    Only the digits on each side of the first ``-`` are kept. All three
    results are ``""`` when there is no ``-``; ``count`` is ``""`` when either
    side has no digits or cannot be converted to ``int``.
    """
    if not isinstance(value, str):
        # Missing or numeric cells must not break the substring test below.
        value = "" if pd.isna(value) else str(value)
    if "-" not in value:
        return "", "", ""
    first, last = value.split("-", 1)
    start = "".join(ch for ch in first if ch.isdigit())
    end = "".join(ch for ch in last if ch.isdigit())
    count = ""
    if start and end:
        try:
            count = str(max(0, int(end) - int(start) + 1))
        except ValueError:
            # str.isdigit() accepts characters int() rejects (e.g. superscripts).
            count = ""
    return start, end, count


def make_export(df: pd.DataFrame) -> pd.DataFrame:
    """Build the lightweight export table from the normalized entries.

    Args:
        df: Frame produced by ``normalize_entries``; it is not modified.

    Returns:
        A frame with exactly ``EXPORT_COLUMNS``, sorted by ``Year`` descending
        with missing years last. ``Year`` uses the nullable ``Int64`` dtype so
        it is written as ``2020`` rather than ``2020.0`` when some dates are
        missing; rows of the same year keep their input order.
    """
    out = df.copy()

    # Derive the publication year.
    if "coverDate" in out.columns:
        out["Year"] = pd.to_datetime(out["coverDate"], errors="coerce").dt.year
    else:
        out["Year"] = ""

    # Derive page start / end / count.
    if "pages" in out.columns:
        parts = [_split_page_range(value) for value in out["pages"]]
        out["Page start"] = [part[0] for part in parts]
        out["Page end"] = [part[1] for part in parts]
        out["Page count"] = [part[2] for part in parts]
    else:
        out["Page start"], out["Page end"], out["Page count"] = "", "", ""

    # Preferred link, built row by row so a frame without rows simply yields
    # an empty column.
    out["Link"] = [_pick_link(row) for _, row in out.iterrows()]

    # Assemble the final columns. Starting from the source index lets scalar
    # fallbacks ("" for a missing title / eid) broadcast to every row.
    final = pd.DataFrame(index=out.index)
    final["Title"] = out.get("title", "")
    final["Year"] = out.get("Year", "")
    final["Source title"] = pick_col(out, "sourceTitle", "prism:publicationName")
    final["Volume"] = pick_col(out, "volume", "prism:volume")
    final["Issue"] = pick_col(out, "issue", "prism:issueIdentifier")
    final["Page start"] = out["Page start"]
    final["Page end"] = out["Page end"]
    final["Page count"] = out["Page count"]
    final["Cited by"] = pick_col(out, "citedBy", "citedby-count")
    final["DOI"] = pick_col(out, "doi", "prism:doi")
    final["Link"] = out["Link"]
    final["ISSN"] = pick_col(out, "issn", "prism:issn")
    final["eISSN"] = pick_col(out, "eIssn", "prism:eIssn")
    final["Document Type"] = pick_col(out, "subtypeDesc", "subtypeDescription")
    final["Open Access"] = pick_col(out, "openAccess", "openaccessFlag")
    final["EID"] = out.get("eid", "")

    # Coerce to numeric (avoids mixing str/int), then to nullable integers.
    final["Year"] = pd.to_numeric(final["Year"], errors="coerce").astype("Int64")
    # A stable sort keeps the output reproducible within a year.
    final = final.sort_values(
        by="Year", ascending=False, na_position="last", kind="mergesort"
    )
    return final[EXPORT_COLUMNS]
+# -------------------------
+# CLI
+# -------------------------
def parse_args(argv: Optional[List[str]] = None):
    """Parse the command-line options of the Scopus export script.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        The parsed ``argparse.Namespace``. ``year_start`` / ``year_end`` are
        ``int`` or ``None`` (the literal ``None`` is accepted in any case);
        ``page_size`` is a positive ``int``.

    Raises:
        SystemExit: Raised by argparse on invalid arguments.
    """
    def year_or_none(raw: str) -> Optional[int]:
        # Validating here gives a normal usage error instead of a traceback
        # later on.
        text = raw.strip()
        if text.lower() == "none":
            return None
        try:
            return int(text)
        except ValueError:
            raise argparse.ArgumentTypeError(
                f"año inválido: {raw!r} (use un entero o 'None')"
            )

    def positive_int(raw: str) -> int:
        # A page size of zero or less would never advance the start offset.
        try:
            value = int(raw)
        except ValueError:
            raise argparse.ArgumentTypeError(f"entero inválido: {raw!r}")
        if value <= 0:
            raise argparse.ArgumentTypeError(f"debe ser mayor que 0: {raw!r}")
        return value

    p = argparse.ArgumentParser(description="Extrae publicaciones Scopus por AF-ID y exporta UN CSV básico (sin autores/abstract/etc.).")
    p.add_argument("--api-key", required=True, help="X-ELS-APIKey")
    p.add_argument("--insttoken", default=None, help="X-ELS-Insttoken (opcional)")
    p.add_argument("--afid", default="60077378", help="Scopus Affiliation ID (AF-ID)")
    p.add_argument("--year-start", type=year_or_none, default=2020, help="Año inicial o 'None'")
    p.add_argument("--year-end", type=year_or_none, default=2024, help="Año final o 'None'")
    p.add_argument("--view", default="STANDARD", choices=["BASIC", "STANDARD", "COMPLETE"], help="Vista del Search API")
    p.add_argument("--page-size", type=positive_int, default=100, help="Tamaño de página (25..200)")
    p.add_argument("--use-cursor", action="store_true", help="Usar cursor pagination")
    p.add_argument("--no-cursor", dest="use_cursor", action="store_false", help="Usar start/count")
    p.set_defaults(use_cursor=True)
    p.add_argument("--out-prefix", default="scopus_afid", help="Prefijo de salida")
    return p.parse_args(argv)
def main():
    """CLI entry point: download, normalize and export the affiliation's records.

    Reads the options from ``parse_args``, fetches the raw entries, and writes
    ``<out_prefix>_scopus_export.csv`` (UTF-8 with BOM, no index column).
    """
    args = parse_args()

    def to_year(raw):
        # The literal "None" (any case) disables the year filter.
        if str(raw).strip().lower() == "none":
            return None
        return int(raw)

    first_year = to_year(args.year_start)
    last_year = to_year(args.year_end)

    print("Descargando desde Scopus (Search API)…")
    entries = fetch_scopus_affiliation(
        api_key=args.api_key,
        afid=args.afid,
        year_start=first_year,
        year_end=last_year,
        view=args.view,
        page_size=args.page_size,
        insttoken=args.insttoken,
        use_cursor=args.use_cursor,
    )
    print(f"Entradas obtenidas: {len(entries)}")

    normalized = normalize_entries(entries)
    export_df = make_export(normalized)

    out_csv = f"{args.out_prefix}_scopus_export.csv"
    export_df.to_csv(out_csv, index=False, encoding="utf-8-sig")
    print(f"Listo: {out_csv}")


if __name__ == "__main__":
    main()

scopus_corpus.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:149ba9d029b6969dedcab02f95d6c7c77897fb7470581a3e437d545ea3af2530
+size 1443176

scopus_corpus_with_specter.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7442d3f19ba7f2d7685506abf8030951ed1486628b5960ff21eea009f9b533c9
+size 4668034