Romanes committed on
Commit
40fe9ab
·
verified ·
1 Parent(s): 3ab07d0

Upload 6 files

Browse files
build_parquet_embeddings.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import numpy as np
3
+ import pandas as pd
4
+ import pyarrow as pa
5
+ import pyarrow.parquet as pq
6
+ from sentence_transformers import SentenceTransformer
7
+ from tqdm import tqdm
8
+
9
+
10
def ensure_col(df: pd.DataFrame, name: str):
    """Guarantee that *df* has a column called *name*.

    A missing column is created in place and filled with empty strings; an
    existing column is left untouched. The same DataFrame object is returned
    so the call can be chained.
    """
    already_present = name in df.columns
    if already_present:
        return df
    df[name] = ""
    return df
14
+
15
def main():
    """Build a Parquet corpus with E5 embeddings from a Scopus CSV export.

    Command-line arguments:
        --csv         path of the exported CSV (required)
        --out         output Parquet path (default ``scopus_corpus.parquet``)
        --model       Sentence-Transformers model name
        --batch-size  batch size handed to ``model.encode``

    The text that gets embedded is "<Title>. Revista: <Source title>" with the
    E5 ``passage: `` prefix. Embeddings are L2-normalised so a dot product
    equals cosine similarity. The table is written with zstd compression.
    """
    p = argparse.ArgumentParser(description="Construye un Parquet con embeddings E5 para recomendación de revistas.")
    p.add_argument("--csv", required=True, help="Ruta al CSV exportado (ej. uptc_afid60077378_scopus_export.csv)")
    p.add_argument("--out", default="scopus_corpus.parquet", help="Ruta de salida Parquet")
    p.add_argument("--model", default="intfloat/multilingual-e5-small", help="Modelo Sentence-Transformers")
    p.add_argument("--batch-size", type=int, default=64, help="Tamaño de batch para el encode")
    args = p.parse_args()

    df = pd.read_csv(args.csv)

    # Make sure the minimal columns of the "simple" export exist.
    for c in ["Title", "Source title", "ISSN", "eISSN", "Year", "Cited by", "DOI", "Link", "EID", "Document Type", "Open Access"]:
        ensure_col(df, c)

    def as_text(column: str) -> list:
        """Return *column* as a list of str, with missing cells as ""."""
        # fillna() must come before astype(str); otherwise NaN is stored as the
        # literal string "nan" and can no longer be told apart from real text.
        return df[column].fillna("").astype(str).tolist()

    # Text used for similarity: works even without Abstract/Keywords.
    # Title plus the journal name as soft context.
    df["text_for_match"] = (
        df["Title"].fillna("").astype(str).str.strip()
        + ". Revista: "
        + df["Source title"].fillna("").astype(str).str.strip()
    ).str.replace(r"\s+", " ", regex=True).str.strip()

    print(f"Cargando modelo: {args.model}")
    model = SentenceTransformer(args.model, device="cpu")

    # E5 expects corpus documents to carry the "passage: " prefix.
    texts = ["passage: " + t for t in df["text_for_match"].tolist()]
    print(f"Codificando {len(texts)} textos…")
    embs = model.encode(
        texts,
        batch_size=args.batch_size,
        show_progress_bar=True,
        normalize_embeddings=True,  # needed so dot product == cosine similarity
    ).astype(np.float32)

    # Type normalisation: unparsable years become null, unparsable counts 0.
    year = pd.to_numeric(df["Year"], errors="coerce").astype("Int64")
    cited = pd.to_numeric(df["Cited by"], errors="coerce").fillna(0).astype(np.int32)

    table = pa.table({
        "eid": pa.array(as_text("EID")),
        "title": pa.array(as_text("Title")),
        "source_title": pa.array(as_text("Source title")),
        "issn": pa.array(as_text("ISSN")),
        "eissn": pa.array(as_text("eISSN")),
        "year": pa.array(year.tolist(), type=pa.int64()),
        "cited_by": pa.array(cited.tolist(), type=pa.int32()),
        "doi": pa.array(as_text("DOI")),
        "link": pa.array(as_text("Link")),
        "Document Type": pa.array(as_text("Document Type")),
        "Open Access": pa.array(as_text("Open Access")),
        "text_for_match": pa.array(df["text_for_match"].tolist()),
        "embedding": pa.array(embs.tolist(), type=pa.list_(pa.float32())),
    })

    pq.write_table(table, args.out, compression="zstd")
    dim = len(embs[0]) if len(embs) else 0
    print(f"OK -> {args.out} | filas: {table.num_rows} | dim: {dim}")


if __name__ == "__main__":
    main()
build_science_embeddings.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # build_science_embeddings.py
2
+ import numpy as np
3
+ import pandas as pd
4
+ import pyarrow.parquet as pq
5
+ import pyarrow as pa
6
+ import pyarrow.parquet as pq
7
+ from tqdm import trange
8
+ from sentence_transformers import SentenceTransformer
9
+
10
+ # Parquet file read as input and Parquet file written as output.
+ PARQUET_PATH = "scopus_corpus.parquet"
11
+ OUT_PATH = "scopus_corpus_with_specter.parquet"
12
+ BATCH = 64 # number of texts handed to each encode() call
13
+ DEVICE = "cpu" # set to "cuda" if you have a GPU
14
+
15
+ # 1) Load the corpus into a pandas DataFrame
16
+ table = pq.read_table(PARQUET_PATH)
17
+ df = table.to_pandas()
18
+
19
+ # 2) Texto para SPECTER: "Title [SEP] Abstract"
20
def row_text(row):
    """Build the SPECTER input text for one corpus row.

    *row* is any mapping-like object with ``.get`` (a pandas Series or a
    plain dict). Returns "<title> [SEP] <abstract>" when the abstract has
    non-blank content, otherwise just the title. Missing values (absent key,
    None, NaN) are treated as empty text.
    """
    def _clean(value):
        # NaN is truthy, so `value or ""` alone would turn a missing cell
        # into the literal text "nan".
        if isinstance(value, float) and value != value:
            return ""
        return str(value or "")

    title = _clean(row.get("title", ""))
    abstract = _clean(row.get("abstract", ""))
    if abstract.strip():
        return f"{title} [SEP] {abstract}"
    return title
26
+
27
def _column_values(name):
    """Return column *name* of the corpus as a list, or Nones if it is absent."""
    return df[name].tolist() if name in df.columns else [None] * len(df)


# Only title/abstract are needed, so read those two columns directly instead
# of materialising a full Series per row with df.iterrows().
texts = [
    row_text({"title": t, "abstract": a})
    for t, a in zip(_column_values("title"), _column_values("abstract"))
]

# 3) SPECTER model (SentenceTransformers)
specter = SentenceTransformer("allenai-specter", device=DEVICE)

# 4) Encode in batches of BATCH texts, with a progress bar
embs = []
for i in trange(0, len(texts), BATCH, desc="SPECTER"):
    batch = texts[i:i+BATCH]
    vecs = specter.encode(batch, normalize_embeddings=True, show_progress_bar=False)
    embs.append(vecs.astype("float32"))

# 5) Store one list of floats per row (same layout as the "embedding" column)
if embs:
    specter_mat = np.vstack(embs).astype("float32")
    df["specter_embedding"] = [v.tolist() for v in specter_mat]
else:
    # np.vstack() raises on an empty list; an empty corpus just gets an
    # empty column.
    specter_mat = np.empty((0, 0), dtype="float32")
    df["specter_embedding"] = []

# (Optional) quick SciBERT embeddings (mean pooling).
# Enable only if you want them in addition to SPECTER.
# from transformers import AutoTokenizer, AutoModel
# import torch
# tok = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
# mdl = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(DEVICE)
# mdl.eval()
# sciberts = []
# with torch.no_grad():
#     for i in trange(0, len(texts), BATCH, desc="SciBERT"):
#         batch = texts[i:i+BATCH]
#         enc = tok(batch, padding=True, truncation=True, max_length=256, return_tensors="pt").to(DEVICE)
#         out = mdl(**enc).last_hidden_state        # [B, T, 768]
#         mask = enc.attention_mask.unsqueeze(-1)   # [B, T, 1]
#         summed = (out * mask).sum(dim=1)
#         counts = mask.sum(dim=1).clamp(min=1)
#         mean = summed / counts
#         # L2-normalise
#         mean = torch.nn.functional.normalize(mean, p=2, dim=1)
#         sciberts.append(mean.cpu().numpy().astype("float32"))
# scibert_mat = np.vstack(sciberts).astype("float32")
# df["scibert_embedding"] = [v.tolist() for v in scibert_mat]

# 6) Write the enriched Parquet file
pq.write_table(pa.Table.from_pandas(df), OUT_PATH)
print("OK ->", OUT_PATH)
journal_recommender_app.py ADDED
@@ -0,0 +1,548 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import pyarrow.parquet as pq
4
+ from sentence_transformers import SentenceTransformer
5
+ import gradio as gr
6
+ import io, os, tempfile, base64, json
7
+ from string import Template
8
+ import networkx as nx
9
+ from networkx.algorithms.community import greedy_modularity_communities
10
+
11
+ # =========================
12
+ # Config
13
+ # =========================
14
# Corpus location and model identifiers.
PARQUET_PATH = "scopus_corpus.parquet"            # point to the enriched parquet if SPECTER was generated
MODEL_NAME_E5 = "intfloat/multilingual-e5-small"  # fast retriever
MODEL_NAME_SPECTER = "allenai-specter"            # scientific-document embeddings
qry_prefix = "query: "

# Load the corpus once at import time.
table = pq.read_table(PARQUET_PATH)
df = table.to_pandas()

# E5 document embeddings, stacked into one (rows x dim) float32 matrix.
embeddings = np.vstack(df["embedding"].to_list()).astype("float32")

# SPECTER document embeddings are optional: present only in the enriched parquet.
SPECTER_AVAILABLE = "specter_embedding" in df.columns
specter_embs = (
    np.vstack(df["specter_embedding"].to_list()).astype("float32")
    if SPECTER_AVAILABLE
    else None
)

# The E5 query encoder is always loaded; SPECTER is loaded lazily on first use.
model_e5 = SentenceTransformer(MODEL_NAME_E5, device="cpu")
38
_model_specter = None


def get_specter():
    """Return the shared SPECTER encoder, creating it on the first call.

    The instance is cached in the module-level ``_model_specter`` so the
    model is built at most once per process.
    """
    global _model_specter
    if _model_specter is not None:
        return _model_specter
    _model_specter = SentenceTransformer(MODEL_NAME_SPECTER, device="cpu")
    return _model_specter
45
+
46
+ # =========================
47
+ # Recomendación (tabla)
48
+ # =========================
49
def recommend(query: str,
              k_articles: int = 300,
              top_n: int = 10,
              min_year: str = "",
              max_year: str = "",
              use_specter: bool = False,
              alpha_e5: float = 0.6):
    """Rank journals by how similar their articles are to a free-text query.

    Args:
        query: title or research idea; must have at least 5 characters.
        k_articles: number of most similar articles to aggregate.
        top_n: number of journals to return.
        min_year / max_year: optional inclusive year bounds as text. Each box
            is parsed on its own; a blank, None or non-numeric value means
            "no bound" for that side only.
        use_specter: blend E5 similarity with SPECTER similarity when SPECTER
            embeddings were loaded.
        alpha_e5: weight of the E5 score in the blend (SPECTER gets 1 - alpha).

    Returns:
        A DataFrame with one row per journal (columns "Revista", "Año",
        "ISSN", "eISSN", "#similitudes", "Score medio", "Mejor similitud" and
        the best-matching article's details), or a single-column "Mensaje"
        DataFrame when the request cannot be served.
    """
    query = (query or "").strip()
    if len(query) < 5:
        return pd.DataFrame({"Mensaje": ["Escribe un título o idea más descriptiva (≥ 5 caracteres)."]})

    def _year_bound(raw):
        """Parse one year box; blank, None or non-numeric input means no bound."""
        text = str(raw if raw is not None else "").strip()
        if not text:
            return None
        try:
            return int(text)
        except ValueError:
            return None

    y0, y1 = _year_bound(min_year), _year_bound(max_year)

    # Work with row *positions*: the embedding matrices are positional, so
    # index labels must not be used to address them.
    keep = np.ones(len(df), dtype=bool)
    if y0 is not None:
        keep &= (df["year"].fillna(-1) >= y0).to_numpy(dtype=bool)
    if y1 is not None:
        keep &= (df["year"].fillna(99999) <= y1).to_numpy(dtype=bool)
    sub_idx = np.flatnonzero(keep)
    if sub_idx.size == 0 and (y0 is not None or y1 is not None):
        return pd.DataFrame({"Mensaje": ["No hay artículos en el rango de años solicitado."]})

    # Checked before encoding so no model call is wasted on an empty request.
    k = min(int(k_articles), len(sub_idx))
    if k <= 0:
        return pd.DataFrame({"Mensaje": ["No hay artículos para comparar."]})

    sub_df = df.iloc[sub_idx]

    # Query embedding; document vectors are normalised, so "@" is cosine similarity.
    q_e5 = model_e5.encode([qry_prefix + query], normalize_embeddings=True)[0].astype("float32")
    sims = embeddings[sub_idx] @ q_e5

    if use_specter and specter_embs is not None:
        q_spc = get_specter().encode([query], normalize_embeddings=True)[0].astype("float32")
        alpha = float(alpha_e5)
        sims = alpha * sims + (1 - alpha) * (specter_embs[sub_idx] @ q_spc)

    # Top-k most similar articles (unordered; ordering happens per journal below).
    top_k_idx_local = np.argpartition(-sims, k - 1)[:k]
    # reset_index gives unique 0..k-1 labels, which idxmax()/loc rely on.
    top_rows = sub_df.iloc[top_k_idx_local].reset_index(drop=True)
    top_rows["sim"] = sims[top_k_idx_local]

    # Aggregate per journal.
    grp_cols = ["source_title", "issn", "eissn"]
    best_idx = top_rows.groupby(grp_cols)["sim"].idxmax()
    agg = (top_rows.groupby(grp_cols)
           .agg(score=("sim", "mean"),
                best=("sim", "max"),
                n=("sim", "size"))
           .reset_index())

    # Attach the details of each journal's best-matching article.
    extra_cols = ["title", "doi", "link", "year", "Document Type", "Open Access"]
    extra_cols_present = [c for c in extra_cols if c in top_rows.columns]
    best_titles = top_rows.loc[best_idx, grp_cols + extra_cols_present].set_index(grp_cols)
    agg = agg.merge(best_titles, left_on=grp_cols, right_index=True, how="left")

    # Hybrid ranking: mostly the mean, a bit of the best hit, and a small
    # bonus for journals with many matching articles.
    agg["rank"] = agg["score"] * 0.8 + agg["best"] * 0.2 + np.log1p(agg["n"]) * 0.02

    out = (
        agg.sort_values("rank", ascending=False)
        .head(int(top_n))
        .rename(columns={
            "source_title": "Revista",
            "issn": "ISSN",
            "eissn": "eISSN",
            "n": "#similitudes",
            "year": "Año",
            "score": "Score medio",
            "best": "Mejor similitud",
            "title": "Título del artículo",
            "doi": "DOI",
            "link": "Link",
        })
    )
    if "Año" in out.columns:
        # Unknown years are shown as an empty cell instead of 0.
        out["Año"] = out["Año"].fillna(0).astype(int).replace(0, "")

    cols = ["Revista", "Año", "ISSN", "eISSN", "#similitudes", "Score medio", "Mejor similitud",
            "Título del artículo", "DOI", "Link", "Document Type", "Open Access"]
    # .copy() so the rounding below writes to an independent frame.
    out = out[[c for c in cols if c in out.columns]].copy()
    for score_col in ("Score medio", "Mejor similitud"):
        if score_col in out.columns:
            out[score_col] = out[score_col].round(3)
    return out
148
+
149
+ # =========================
150
+ # Grafo interactivo (vis-network en iframe)
151
+ # =========================
152
def build_similarity_network_html(query_text: str,
                                  k_articles: int,
                                  min_year: str,
                                  max_year: str,
                                  use_specter: bool = False,
                                  alpha_e5: float = 0.6,
                                  top_nodes: int = 15,
                                  doc_edge_threshold: float = 0.35) -> str:
    """Render an interactive similarity graph (vis-network) as an iframe.

    The query is the central node. The ``top_nodes`` most similar articles
    (chosen among the ``k_articles`` best matches inside the optional year
    range) are linked to it and to each other. Article-to-article edges start
    hidden when their similarity is below ``doc_edge_threshold``.

    Args:
        query_text: title or idea; must have at least 5 characters.
        k_articles: size of the candidate pool.
        min_year / max_year: optional inclusive bounds as text; each one is
            parsed on its own and ignored when blank, None or non-numeric.
        use_specter: blend E5 with SPECTER for node scores and use SPECTER
            for article-to-article edges, when SPECTER embeddings are loaded.
        alpha_e5: weight of E5 in the blend.
        top_nodes: number of article nodes drawn.
        doc_edge_threshold: initial visibility threshold for article edges.

    Returns:
        An ``<iframe>`` tag embedding the page as a base64 data URL, or a
        short ``<p>`` message when the graph cannot be built.
    """
    # Local import: keeps the dependency inside this block and avoids
    # clashing with variables named `html`.
    from html import escape as _esc

    qtxt = (query_text or "").strip()
    if len(qtxt) < 5:
        return "<p>Escribe un título/idea más descriptiva (≥ 5 caracteres).</p>"

    def _year_bound(raw):
        """Parse one year box; blank, None or non-numeric input means no bound."""
        text = str(raw if raw is not None else "").strip()
        if not text:
            return None
        try:
            return int(text)
        except ValueError:
            return None

    def _txt(value):
        """Cell value as display text; None, NaN and "nan"/"None" become ""."""
        if value is None or (isinstance(value, float) and value != value):
            return ""
        text = str(value).strip()
        return "" if text.lower() in ("nan", "none") else text

    def _js(obj):
        """JSON that is safe to inline inside a <script> element."""
        # "</" inside a string literal would terminate the script element early.
        return json.dumps(obj).replace("</", "<\\/")

    # ---- Year filter (row positions, because the embedding matrices are positional) ----
    y0, y1 = _year_bound(min_year), _year_bound(max_year)
    keep = np.ones(len(df), dtype=bool)
    if y0 is not None:
        keep &= (df["year"].fillna(-1) >= y0).to_numpy(dtype=bool)
    if y1 is not None:
        keep &= (df["year"].fillna(99999) <= y1).to_numpy(dtype=bool)
    sub_idx = np.flatnonzero(keep)
    if sub_idx.size == 0 and (y0 is not None or y1 is not None):
        return "<p>No hay artículos en el rango de años solicitado.</p>"

    k = min(int(k_articles), len(sub_idx))
    if k <= 0:
        return "<p>No hay suficientes artículos para graficar la red.</p>"

    # ---- Similarity to the query (drives node size) ----
    q_e5 = model_e5.encode([qry_prefix + qtxt], normalize_embeddings=True)[0].astype("float32")
    ns = embeddings[sub_idx] @ q_e5
    if use_specter and specter_embs is not None:
        q_spc = get_specter().encode([qtxt], normalize_embeddings=True)[0].astype("float32")
        alpha = float(alpha_e5)
        ns = alpha * ns + (1 - alpha) * (specter_embs[sub_idx] @ q_spc)

    # ---- Top-k pool, then the best `top_nodes`, most similar first ----
    pool = np.argpartition(-ns, k - 1)[:k]
    order = pool[np.argsort(-ns[pool])][: int(top_nodes)]
    if len(order) < 2:
        return "<p>No hay suficientes artículos para graficar la red.</p>"
    node_idx = sub_idx[order]                      # positions in the full corpus
    top_rows = df.iloc[node_idx].copy()
    top_rows["sim_to_query"] = ns[order]

    # ---- Article-to-article similarity ----
    # E5 by default; SPECTER when enabled and available (better topical coherence).
    pair_mat = embeddings[node_idx]
    if use_specter and specter_embs is not None:
        pair_mat = specter_embs[node_idx]
    pair_sims = pair_mat @ pair_mat.T

    # ---- Colours by year (teal gradient) ----
    years = top_rows["year"].fillna(0).astype(int).to_numpy()
    y_valid = years[years > 0]
    y_min, y_max = (int(y_valid.min()), int(y_valid.max())) if len(y_valid) else (2000, 2025)

    def teal_year_color(y: int) -> str:
        t = 0.0 if (not y or y <= 0 or y_max == y_min) else (y - y_min) / (y_max - y_min)
        h = 170
        s = int(35 + 35 * t)
        l = int(85 - 30 * t)
        return f"hsl({h}, {s}%, {l}%)"

    # ---- One id per node, computed once and reused everywhere ----
    # vis.DataSet rejects duplicate ids, so empty or repeated EIDs fall back to
    # (or get suffixed with) the row position.
    ids, seen = [], set()
    for pos, (_, row) in zip(node_idx, top_rows.iterrows()):
        nid = _txt(row.get("eid", "")) or f"row-{int(pos)}"
        if nid in seen or nid == "QUERY":
            nid = f"{nid}#{int(pos)}"
        seen.add(nid)
        ids.append(nid)

    # ---- Communities (clusters) for the "Comunidad" colour mode ----
    Gc = nx.Graph()
    Gc.add_nodes_from(ids)
    n = len(ids)
    for i in range(n):
        for j in range(i + 1, n):
            w = float(pair_sims[i, j])
            if w >= float(doc_edge_threshold):
                Gc.add_edge(ids[i], ids[j], weight=w)

    comms = list(greedy_modularity_communities(Gc, weight="weight")) if Gc.number_of_edges() else [set(ids)]
    node2comm = {nid: ci for ci, c in enumerate(comms) for nid in c}

    def pastel_palette(k, s=60, l=65):
        return [f"hsl({int(360*i/k)}, {s}%, {l}%)" for i in range(max(1, k))]
    comm_colors = pastel_palette(len(comms))
    group_colors = {str(i): comm_colors[i] for i in range(len(comms))}

    # ---- Nodes and edges for vis.js ----
    ns_nodes = top_rows["sim_to_query"].to_numpy(dtype=float)
    smin, smax = (float(ns_nodes.min()), float(ns_nodes.max())) if ns_nodes.size else (0.0, 0.0)

    def node_size(sim):
        if smax <= smin:
            return 18
        return 14 + 40 * (float(sim) - smin) / (smax - smin)

    nodes, edges = [], []
    nodes.append({
        "id": "QUERY", "label": "Consulta", "title": _esc(qtxt),
        "shape": "star", "size": 46, "color": "#e45756",
        "font": {"size": 16, "strokeWidth": 6, "strokeColor": "#ffffff"}
    })

    for nid, (_, row) in zip(ids, top_rows.iterrows()):
        title = _txt(row.get("title", ""))[:160]
        journal = _txt(row.get("source_title", ""))[:120]
        year = int(row.get("year", 0)) if pd.notna(row.get("year", None)) else 0
        doi = _txt(row.get("doi", ""))
        link = _txt(row.get("link", ""))
        sim = float(row.get("sim_to_query", 0.0))

        # Only http(s) targets may be opened; fall back to the DOI resolver.
        if link.lower().startswith(("http://", "https://")):
            url = link
        elif doi:
            url = "https://doi.org/" + doi
        else:
            url = ""

        label = (journal or title)[:40] or "Artículo"
        # Corpus text is escaped: the tooltip markup is parsed as HTML in the page.
        tooltip = (
            f"<b>{_esc(title)}</b><br>"
            f"Revista: {_esc(journal)}<br>"
            f"Año: {year if year>0 else 'N/D'}<br>"
            f"Similitud con consulta: {sim:.3f}<br>"
            f"DOI: {_esc(doi)}<br>"
        )
        if url:
            tooltip += f"<a href='{_esc(url, quote=True)}' target='_blank' rel='noopener'>Abrir</a>"

        nodes.append({
            "id": nid, "label": label, "title": tooltip, "url": url,
            "size": node_size(sim), "year": year, "group": str(node2comm.get(nid, 0)),
            "colorYear": teal_year_color(year),
            "font": {"size": 14, "strokeWidth": 6, "strokeColor": "#ffffff"}
        })
        edges.append({
            "from": "QUERY", "to": nid,
            "value": sim,
            "width": 1 + 6*max(0.0, sim),
            "color": {"color": "#9fb7b3"},
            "smooth": True
        })

    for i in range(n):
        for j in range(i + 1, n):
            w = float(pair_sims[i, j])
            edges.append({
                "from": ids[i], "to": ids[j],
                "value": w,
                "width": max(0.8, 3.0*(w-0.2)),
                "hidden": w < doc_edge_threshold,
                "color": {"color": "#b9c7c5"},
                "smooth": True
            })

    options = {
        "interaction": {
            "hover": True, "multiselect": True, "dragNodes": True,
            "navigationButtons": False,
            "keyboard": {"enabled": True, "bindToWindow": True}
        },
        "physics": {
            "enabled": True, "solver": "forceAtlas2Based",
            "forceAtlas2Based": {
                "avoidOverlap": 0.4, "gravitationalConstant": -45,
                "centralGravity": 0.015, "springLength": 135,
                "springConstant": 0.055, "damping": 0.45
            },
            "stabilization": {"iterations": 140}
        },
        "nodes": {
            "shape": "dot", "borderWidth": 1,
            "shadow": {"enabled": True, "size": 8, "x": 0, "y": 1}
        },
        "edges": {
            "smooth": {"type": "continuous"},
            "selectionWidth": 2,
            "shadow": {"enabled": True, "size": 6, "x": 0, "y": 1}
        }
    }

    tmpl = Template(r"""
<div style="font-family:system-ui,-apple-system,Segoe UI,Roboto; background:#f6f8f9; padding:8px; border-radius:8px;">
  <div style="display:flex; gap:14px; align-items:center; margin:6px 0 10px 0;">
    <div style="white-space:nowrap;">
      <label><b>Color por:</b></label>
      <label style="margin-left:6px;"><input type="radio" name="colorMode" value="year" checked> Año</label>
      <label style="margin-left:6px;"><input type="radio" name="colorMode" value="community"> Comunidad</label>
    </div>
    <div style="min-width:290px;">
      <label for="edgeSlider"><b>Umbral</b>: <span id="edgeVal">$THRESH</span></label>
      <input id="edgeSlider" type="range" min="0" max="1" step="0.01" value="$THRESH"
             style="width:180px; margin-left:8px;">
    </div>
  </div>

  <div style="display:flex; align-items:center; gap:10px; margin:2px 0 8px 6px;">
    <div style="width:82px; text-align:right; color:#5b6b70; font-size:12px;">Años:</div>
    <input id="yearMin" type="range" min="$YMIN" max="$YMAX" value="$YMIN" step="1" style="flex:1;">
    <input id="yearMax" type="range" min="$YMIN" max="$YMAX" value="$YMAX" step="1" style="flex:1;">
    <div id="yearLbl" style="width:130px; text-align:left; color:#5b6b70; font-size:12px;">$YMIN – $YMAX</div>
  </div>
  <div style="height:10px; margin:0 6px 8px 90px; background:linear-gradient(90deg, hsl(170,35%,85%) 0%, hsl(170,70%,55%) 100%); border-radius:6px;"></div>

  <div id="netContainer" style="height:720px; border:1px solid #d6e0e2; border-radius:12px; background:#fbfcfd;"></div>

  <div style="position:relative; margin-top:6px;">
    <div style="position:absolute; left:6px; bottom:6px; display:flex; gap:8px;">
      <button id="btnFit" title="Ajustar vista" style="border:0; background:#e7f0ef; padding:6px 10px; border-radius:10px;">⟲</button>
      <button id="btnPNG" title="Exportar PNG" style="border:0; background:#e7f0ef; padding:6px 10px; border-radius:10px;">⬇</button>
      <button id="btnHelp" title="Ayuda" style="border:0; background:#e7f0ef; padding:6px 10px; border-radius:10px;">?</button>
    </div>
  </div>
</div>

<script src="https://unpkg.com/vis-network@9.1.9/dist/vis-network.min.js"></script>
<script>
(function(){
  // vis-network 9 shows string titles as plain text, so the (already escaped)
  // tooltip markup is wrapped in an element.
  function toElement(markup){
    const box = document.createElement('div');
    box.innerHTML = markup || '';
    return box;
  }
  const rawNodes = $NODES;
  const nodes = new vis.DataSet(rawNodes.map(n => Object.assign({}, n, { title: toElement(n.title) })));
  const edges = new vis.DataSet($EDGES);
  const options = $OPTIONS;
  const groupColors = $GROUPCOLORS;

  const container = document.getElementById('netContainer');
  const net = new vis.Network(container, {nodes, edges}, options);
  window.network = net; window.nodes = nodes; window.edges = edges;

  // Colour by year / community
  function currentMode(){
    return document.querySelector('input[name="colorMode"]:checked').value;
  }
  function baseColor(n){
    return (currentMode()==='community')
      ? (groupColors[String(n.group)]||'#9fb7b3')
      : (n.colorYear||'#9fb7b3');
  }
  function applyColors(){
    nodes.forEach(n=>{
      if(n.id==='QUERY') return;
      nodes.update({ id:n.id, color: baseColor(n) });
    });
  }
  applyColors();
  document.querySelectorAll('input[name="colorMode"]').forEach(r =>
    r.addEventListener('change', applyColors)
  );

  // Edge visibility depends on BOTH the threshold and the year range, so it
  // is computed in one place.
  const slider = document.getElementById('edgeSlider');
  const edgeVal = document.getElementById('edgeVal');
  const sMin=document.getElementById('yearMin'), sMax=document.getElementById('yearMax');
  const yLbl=document.getElementById('yearLbl');
  const visible = new Set();

  function refreshEdges(){
    const th = parseFloat(slider.value||'0');
    edges.forEach(e=>{
      const viaQuery = e.from==='QUERY' || e.to==='QUERY';
      const show = visible.has(e.from) && visible.has(e.to) && (viaQuery || (e.value||0) >= th);
      edges.update({ id:e.id, hidden:!show });
    });
  }
  function applyYearFilter(a,b){
    const lo=Math.min(a,b), hi=Math.max(a,b);
    yLbl.textContent = lo+" – "+hi;
    visible.clear();
    nodes.forEach(n=>{
      if(n.id==='QUERY'){ visible.add(n.id); return; }
      const y=Number(n.year||0);
      const show=(y===0)||(y>=lo && y<=hi);
      nodes.update({ id:n.id, hidden:!show });
      if(show) visible.add(n.id);
    });
    refreshEdges();
  }
  function onYearInput(){ applyYearFilter(+sMin.value, +sMax.value); }

  slider.addEventListener('input', ()=>{
    edgeVal.textContent = parseFloat(slider.value||'0').toFixed(2);
    refreshEdges();
  });
  sMin.addEventListener('input', onYearInput);
  sMax.addEventListener('input', onYearInput);
  onYearInput();

  // Neighbourhood highlight (only through edges that are currently shown)
  const inactive='rgba(200,210,210,0.35)';
  function highlight(ids){
    const sel=new Set(ids), nbr=new Set(ids);
    edges.forEach(e=>{
      if(e.hidden) return;
      if(sel.has(e.from)) nbr.add(e.to);
      if(sel.has(e.to)) nbr.add(e.from);
    });
    nodes.forEach(n=>{
      if(n.id==='QUERY') return;
      nodes.update({ id:n.id, color: nbr.has(n.id) ? baseColor(n) : inactive });
    });
  }
  net.on('selectNode', p=>highlight(p.nodes));
  net.on('deselectNode', applyColors);

  // Buttons
  document.getElementById('btnFit').onclick = () => net.fit({animation: true});
  document.getElementById('btnPNG').onclick = () => {
    const url = net.canvas.frame.canvas.toDataURL('image/png');
    const a = document.createElement('a'); a.href = url; a.download = 'graph.png'; a.click();
  };
  document.getElementById('btnHelp').onclick = () => alert(
    "Usa: Color por Año/Comunidad • Umbral de arista • Rango de años • Clic para resaltar vecindad • Doble clic abre el enlace (tooltip)."
  );

  // Double click opens the article link / DOI when there is one
  net.on('doubleClick', (p) => {
    if (p.nodes && p.nodes.length===1){
      const n = nodes.get(p.nodes[0]);
      if (n && n.url) window.open(n.url, '_blank', 'noopener');
    }
  });
})();
</script>
""")

    page = tmpl.substitute(
        NODES=_js(nodes),
        EDGES=_js(edges),
        OPTIONS=_js(options),
        GROUPCOLORS=_js(group_colors),
        YMIN=y_min,
        YMAX=y_max,
        THRESH=f"{doc_edge_threshold:.2f}",
    )

    b64 = base64.b64encode(page.encode("utf-8")).decode("ascii")
    return (
        f'<iframe src="data:text/html;charset=utf-8;base64,{b64}" '
        f'style="width:100%;height:820px;border:0;" '
        f'sandbox="allow-scripts allow-same-origin allow-popups"></iframe>'
    )
482
+
483
+ # =========================
484
+ # UI Gradio
485
+ # =========================
486
+ # Gradio UI: inputs first, then the two action buttons, then the outputs.
+ with gr.Blocks(title="Recomendador de Revistas (Scopus)") as demo:
487
+ gr.Markdown("## Recomendación de revistas UPTC")
488
+
489
+ # --- Main input ---
490
+ with gr.Row():
491
+ query = gr.Textbox(
492
+ label="Título o idea de investigación",
493
+ lines=3,
494
+ placeholder="Ej.: Detección temprana de fallas en motores usando aprendizaje profundo…"
495
+ )
496
+
497
+ # --- Year filters (free text; parsed inside the handlers) ---
498
+ with gr.Row():
499
+ min_year = gr.Textbox(label="Año mínimo (opcional)", placeholder="2019")
500
+ max_year = gr.Textbox(label="Año máximo (opcional)", placeholder="2025")
501
+
502
+ # --- Top-k pool size and number of journals shown ---
503
+ with gr.Row():
504
+ k_articles = gr.Slider(50, 1000, value=300, step=50, label="Artículos considerados (top-k)")
505
+ top_n = gr.Slider(5, 20, value=10, step=1, label="Nº de revistas a mostrar")
506
+
507
+ # --- Blend with SPECTER (checked by default only when SPECTER embeddings were loaded) ---
508
+ with gr.Row():
509
+ use_specter = gr.Checkbox(
510
+ label="Fusionar con SPECTER (mejor afinidad científica)",
511
+ value=SPECTER_AVAILABLE
512
+ )
513
+ alpha_e5 = gr.Slider(0.0, 1.0, value=0.6, step=0.05, label="Peso E5 (1−α = SPECTER)")
514
+
515
+ # --- BUTTONS: always right below the blend controls ---
516
+ with gr.Row():
517
+ btn = gr.Button("Recomendar")
518
+ btn_net = gr.Button("Ver red de similitud")
519
+
520
+ # --- OUTPUTS (declared AFTER the buttons so the buttons stay above them) ---
521
+ out = gr.Dataframe(
522
+ row_count=10, wrap=True,
523
+ column_widths=[180, 60, 90, 90, 90, 90, 90, 250, 120, 120, 120, 100],
524
+ label="Revistas recomendadas"
525
+ )
526
+ out_net_html = gr.HTML(label="Grafo interactivo (explorable)")
527
+
528
+ # --- Event wiring (must come after 'out' and 'out_net_html' exist) ---
529
+ btn.click(
530
+ fn=recommend,
531
+ inputs=[query, k_articles, top_n, min_year, max_year, use_specter, alpha_e5],
532
+ outputs=out
533
+ )
534
+ query.submit(
535
+ fn=recommend,
536
+ inputs=[query, k_articles, top_n, min_year, max_year, use_specter, alpha_e5],
537
+ outputs=out
538
+ )
539
+ btn_net.click(
540
+ fn=lambda q, ka, y0, y1, us, a: build_similarity_network_html(
541
+ q, ka, y0, y1, use_specter=us, alpha_e5=a, top_nodes=15, doc_edge_threshold=0.35
542
+ ),
543
+ inputs=[query, k_articles, min_year, max_year, use_specter, alpha_e5],
544
+ outputs=[out_net_html]
545
+ )
546
+
547
+ if __name__ == "__main__":
548
+ demo.launch()
scopus.py ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # scopus_simple_extract.py
2
+ # Extrae resultados Scopus por AF-ID y exporta UN CSV "amigable" con campos básicos.
3
+ # NO pide Abstract, Autores, Keywords, Funding, Conference, etc.
4
+
5
+ import time
6
+ import argparse
7
+ import urllib.parse as urlparse
8
+ from typing import Dict, List, Optional
9
+ import requests
10
+ import numpy as np
11
+ import pandas as pd
12
+
13
+ BASE_URL_SEARCH = "https://api.elsevier.com/content/search/scopus"
14
+
15
+ # -------------------------
16
+ # HTTP utilidades
17
+ # -------------------------
18
def build_headers(api_key: str, insttoken: Optional[str] = None) -> Dict[str, str]:
    """Assemble the HTTP headers for an Elsevier API request.

    The API key is always sent (whitespace-trimmed). The institutional token
    header is added only when a non-empty *insttoken* is supplied.
    """
    headers = {"Accept": "application/json", "X-ELS-APIKey": api_key.strip()}
    if not insttoken:
        return headers
    headers["X-ELS-Insttoken"] = insttoken.strip()
    return headers
23
+
24
def get_json(session: requests.Session, url: str, params: Dict[str, str],
             headers: Dict[str, str], max_retries: int = 6, sleep_base: float = 0.75) -> Dict:
    """GET *url* and return the decoded JSON body, retrying transient failures.

    Network errors and HTTP 429/500/502/503/504 are retried up to
    *max_retries* times with exponential backoff (``2**attempt * sleep_base``
    seconds). If a 401 says the institutional token is not associated with
    the API key, the request is repeated once without ``X-ELS-Insttoken``
    and the remaining attempts also go without it. The caller's *headers*
    dict is never modified.

    Raises:
        RuntimeError: on any other non-OK status, on a body that is not
            valid JSON, or when every attempt failed.
    """
    retryable = (429, 500, 502, 503, 504)
    last_exc = None
    tried_without_token = False

    def _backoff(attempt: int) -> None:
        # No point in sleeping after the final attempt: nothing follows it.
        if attempt < max_retries:
            time.sleep((2 ** attempt) * sleep_base)

    def _error_payload(resp):
        """Best-effort JSON body of an error response ({} if not JSON)."""
        try:
            return resp.json()
        except ValueError:
            return {}

    for t in range(max_retries + 1):
        try:
            r = session.get(url, params=params, headers=headers, timeout=90)

            if (r.status_code == 401
                    and not tried_without_token
                    and "X-ELS-Insttoken" in headers
                    and "Institution Token is not associated with API Key" in str(_error_payload(r))):
                # Token/key mismatch: try exactly once more without the token.
                tried_without_token = True
                headers = {k: v for k, v in headers.items() if k != "X-ELS-Insttoken"}
                r = session.get(url, params=params, headers=headers, timeout=90)
        except requests.RequestException as ex:
            last_exc = ex
            _backoff(t)
            continue

        if r.status_code in retryable:
            # The server answered, so a later failure is not a network error.
            last_exc = None
            _backoff(t)
            continue

        if not r.ok:
            raise RuntimeError(f"HTTP {r.status_code} – {_error_payload(r) or r.text}")

        try:
            return r.json()
        except ValueError as ex:
            raise RuntimeError("La respuesta no es JSON decodificable.") from ex

    if last_exc is not None:
        raise RuntimeError(f"Error de red persistente: {last_exc}") from last_exc
    raise RuntimeError("No se obtuvo respuesta estable tras varios reintentos.")
84
+
85
+ # -------------------------
86
+ # Paginación Search API
87
+ # -------------------------
88
def _page_entries(payload: Dict) -> List[Dict]:
    """Return the result entries of one Search API page, without placeholders.

    NOTE(review): the Scopus Search API is understood to report an empty
    result set as a single entry that carries an ``error`` key and no
    identifiers; such items are dropped here so they are not collected as
    documents. Confirm against the API documentation.
    """
    raw = payload.get("search-results", {}).get("entry", []) or []
    return [
        item for item in raw
        if not (isinstance(item, dict)
                and "error" in item
                and "eid" not in item
                and "dc:identifier" not in item)
    ]


def _next_cursor(payload: Dict) -> Optional[str]:
    """Return the ``cursor`` value of the page's first ``next`` link, or ``None``."""
    for link in payload.get("search-results", {}).get("link", []) or []:
        if link.get("@ref") != "next":
            continue
        href = link.get("@href")
        if not href:
            return None
        query = urlparse.urlparse(href).query
        return (urlparse.parse_qs(query).get("cursor") or [None])[0]
    return None


def _paginate_cursor(session: requests.Session, headers: Dict[str, str],
                     query: str, page_size: int, view: str) -> List[Dict]:
    """Collect every entry for ``query`` by following cursor links."""
    params = {"query": query, "view": view, "count": str(page_size), "cursor": "*"}
    entries: List[Dict] = []
    while True:
        payload = get_json(session, BASE_URL_SEARCH, params, headers)
        chunk = _page_entries(payload)
        if not chunk:
            # An empty page means the result set is exhausted; stopping here
            # prevents an endless loop if a "next" link is still present.
            break
        entries.extend(chunk)
        token = _next_cursor(payload)
        if not token or token == params["cursor"]:
            # No further page, or the cursor did not advance.
            break
        params["cursor"] = token
    return entries


def _paginate_startcount(session: requests.Session, headers: Dict[str, str],
                         query: str, page_size: int, view: str,
                         hard_limit: Optional[int] = None) -> List[Dict]:
    """Collect entries for ``query`` using ``start``/``count`` offsets.

    Stops on an empty page, on a page shorter than ``page_size``, or once the
    offset reaches ``hard_limit`` (no limit when ``None``).
    """
    entries: List[Dict] = []
    start = 0
    while hard_limit is None or start < hard_limit:
        params = {
            "query": query,
            "view": view,
            "count": str(page_size),
            "start": str(start),
        }
        payload = get_json(session, BASE_URL_SEARCH, params, headers)
        chunk = _page_entries(payload)
        if not chunk:
            break
        entries.extend(chunk)
        if len(chunk) < page_size:
            break
        start += page_size
    return entries


def extract_by_year_cursor(session: requests.Session, headers: Dict[str, str],
                           afid: str, year: int, page_size: int, view: str) -> List[Dict]:
    """Fetch all entries of affiliation ``afid`` published in ``year`` via cursor pagination.

    Returns:
        The raw Search API entries, in the order the pages were received.

    Raises:
        RuntimeError: Propagated from ``get_json``.
    """
    query = f"AF-ID({afid}) AND PUBYEAR = {year}"
    return _paginate_cursor(session, headers, query, page_size, view)


def extract_by_year_startcount(session: requests.Session, headers: Dict[str, str],
                               afid: str, year: int, page_size: int, view: str,
                               hard_limit: int = 20000) -> List[Dict]:
    """Fetch entries of affiliation ``afid`` published in ``year`` via start/count paging.

    Args:
        hard_limit: Paging stops once the ``start`` offset reaches this value.

    Raises:
        RuntimeError: Propagated from ``get_json``.
    """
    query = f"AF-ID({afid}) AND PUBYEAR = {year}"
    return _paginate_startcount(session, headers, query, page_size, view, hard_limit)


def extract_no_year(session: requests.Session, headers: Dict[str, str],
                    afid: str, page_size: int, view: str, use_cursor: bool) -> List[Dict]:
    """Fetch all entries of affiliation ``afid`` without a publication-year filter.

    Args:
        use_cursor: Follow cursor links when true; otherwise page with
            ``start``/``count`` offsets (no offset limit is applied).

    Raises:
        RuntimeError: Propagated from ``get_json``.
    """
    query = f"AF-ID({afid})"
    if use_cursor:
        return _paginate_cursor(session, headers, query, page_size, view)
    return _paginate_startcount(session, headers, query, page_size, view)
174
+
175
+ # -------------------------
176
+ # Normalización básica (sin autores/abstract/keywords/funding/etc.)
177
+ # -------------------------
178
# Search API entry key -> flat column name used when normalising entries.
# Abstract and keyword fields are intentionally not requested.
TOP_FIELD_MAP = {
    # Bibliographic description
    "dc:title": "title",
    "prism:coverDate": "coverDate",
    "prism:doi": "doi",
    "prism:publicationName": "sourceTitle",
    "prism:issn": "issn",
    "prism:eIssn": "eIssn",
    "prism:volume": "volume",
    "prism:issueIdentifier": "issue",
    "prism:pageRange": "pages",
    # Metrics and classification
    "citedby-count": "citedBy",
    "subtype": "subtype",
    "subtypeDescription": "subtypeDesc",
    "openaccessFlag": "openAccess",
    # Identifiers and URLs
    "dc:identifier": "identifier",
    "eid": "eid",
    "prism:url": "prismUrl",
}
197
+
198
def links_to_dict(links: List[Dict]) -> Dict[str, str]:
    """Flatten an entry's link list into ``{"link_<ref>": href}``.

    Items missing ``@ref`` or ``@href`` (or with an empty value) are skipped;
    when a ref occurs more than once the last href wins. ``None`` or an empty
    list yields an empty dict.
    """
    pairs = ((item.get("@ref"), item.get("@href")) for item in links or [])
    return {f"link_{ref}": href for ref, href in pairs if ref and href}
206
+
207
def normalize_entries(entries: List[Dict]) -> pd.DataFrame:
    """Convert raw Search API entries into a flat, de-duplicated DataFrame.

    Each entry contributes the fields listed in ``TOP_FIELD_MAP`` (renamed to
    their flat column names) plus one ``link_<ref>`` column per link.
    ``coverDate`` is parsed to datetime (unparseable values become ``NaT``)
    and rows are de-duplicated on ``eid``/``identifier``, keeping the first.

    Entries that only carry an ``error`` key and no identifier are skipped so
    they do not turn into blank rows.
    NOTE(review): this targets the placeholder the Search API is understood to
    return for an empty result set; confirm against the API documentation.

    Returns:
        The normalised DataFrame; empty when no usable entry was given.
    """
    rows: List[Dict] = []
    for entry in entries:
        is_placeholder = (
            "error" in entry
            and "eid" not in entry
            and "dc:identifier" not in entry
        )
        if is_placeholder:
            continue
        row = {dst: entry[src] for src, dst in TOP_FIELD_MAP.items() if src in entry}
        row.update(links_to_dict(entry.get("link")))
        rows.append(row)

    frame = pd.DataFrame(rows)
    if frame.empty:
        return frame

    if "coverDate" in frame.columns:
        frame["coverDate"] = pd.to_datetime(frame["coverDate"], errors="coerce")

    key_cols = [c for c in ("eid", "identifier") if c in frame.columns]
    if key_cols:
        frame = frame.drop_duplicates(subset=key_cols, keep="first")
    return frame
225
+
226
+ # -------------------------
227
+ # Fallbacks de vista/paginación
228
+ # -------------------------
229
def try_extract_year(session, headers, afid, year, page_size, view, use_cursor) -> List[Dict]:
    """Extract one publication year, falling back once when the API rejects the request.

    Fallbacks, chosen from the text of the ``RuntimeError``:
      * ``AUTHORIZATION_ERROR``: retry with the next lower view
        (COMPLETE -> STANDARD, STANDARD -> BASIC); re-raise for any other view.
      * ``INVALID_INPUT`` about the service-level maximum: retry with a page
        size of 25 and start/count paging.
      * any other error while using the cursor: retry with start/count paging.

    Raises:
        RuntimeError: When no fallback applies, or when the fallback itself fails.
    """
    def run(size, with_cursor, which_view):
        extractor = extract_by_year_cursor if with_cursor else extract_by_year_startcount
        return extractor(session, headers, afid, year, size, which_view)

    try:
        return run(page_size, use_cursor, view)
    except RuntimeError as err:
        text = str(err)

        if "AUTHORIZATION_ERROR" in text:
            if view == "COMPLETE":
                lower_view = "STANDARD"
            elif view == "STANDARD":
                lower_view = "BASIC"
            else:
                lower_view = None
            if not lower_view:
                raise
            return run(page_size, use_cursor, lower_view)

        over_limit = ("INVALID_INPUT" in text
                      and "maximum number allowed for the service level" in text)
        if over_limit:
            # Smaller pages and no cursor.
            return run(25, False, view)

        if not use_cursor:
            raise
        return run(page_size, False, view)
250
+
251
def try_extract_no_year(session, headers, afid, page_size, view, use_cursor) -> List[Dict]:
    """Extract without a year filter, falling back once when the API rejects the request.

    Fallbacks, chosen from the text of the ``RuntimeError``:
      * ``AUTHORIZATION_ERROR``: retry with the next lower view
        (COMPLETE -> STANDARD, STANDARD -> BASIC); re-raise for any other view.
      * ``INVALID_INPUT`` about the service-level maximum: retry with a page
        size of 25 and start/count paging.
      * any other error while using the cursor: retry with start/count paging.

    Raises:
        RuntimeError: When no fallback applies, or when the fallback itself fails.
    """
    def run(size, which_view, with_cursor):
        return extract_no_year(session, headers, afid, size, which_view, with_cursor)

    try:
        return run(page_size, view, use_cursor)
    except RuntimeError as err:
        text = str(err)

        if "AUTHORIZATION_ERROR" in text:
            # Step down one view level: COMPLETE -> STANDARD -> BASIC.
            if view == "COMPLETE":
                return run(page_size, "STANDARD", use_cursor)
            if view == "STANDARD":
                return run(page_size, "BASIC", use_cursor)
            raise

        over_limit = ("INVALID_INPUT" in text
                      and "maximum number allowed for the service level" in text)
        if over_limit:
            return run(25, view, False)

        if not use_cursor:
            raise
        return run(page_size, view, False)
267
+
268
def fetch_scopus_affiliation(api_key: str,
                             afid: str = "60077378",
                             year_start: Optional[int] = 2020,
                             year_end: Optional[int] = 2024,
                             view: str = "STANDARD",
                             page_size: int = 100,
                             insttoken: Optional[str] = None,
                             use_cursor: bool = True) -> List[Dict]:
    """Download the raw Search API entries of one affiliation.

    Args:
        api_key: Elsevier API key.
        afid: Scopus affiliation ID.
        year_start: First publication year (inclusive).
        year_end: Last publication year (inclusive). If either bound is
            ``None`` the query is issued once, without a year filter.
        view: Search API view to request.
        page_size: Entries requested per page.
        insttoken: Optional institution token.
        use_cursor: Prefer cursor pagination over start/count paging.

    Returns:
        All collected entries; with a year range, years are queried one by
        one in ascending order and their entries concatenated.

    Raises:
        RuntimeError: Propagated from the extraction helpers.
    """
    headers = build_headers(api_key, insttoken)
    # The context manager closes the session (and its pooled connections)
    # even when an extraction raises.
    with requests.Session() as session:
        if year_start is None or year_end is None:
            return try_extract_no_year(session, headers, afid, page_size, view, use_cursor)

        entries: List[Dict] = []
        for yr in range(int(year_start), int(year_end) + 1):
            entries.extend(
                try_extract_year(session, headers, afid, yr, page_size, view, use_cursor)
            )
        return entries
284
+
285
+ # -------------------------
286
+ # Export UN SOLO CSV (ligero)
287
+ # -------------------------
288
# Columns of the exported table, in output order.
EXPORT_COLUMNS = [
    "Title",
    "Year",
    "Source title",
    "Volume",
    "Issue",
    "Page start",
    "Page end",
    "Page count",
    "Cited by",
    "DOI",
    "Link",
    "ISSN",
    "eISSN",
    "Document Type",
    "Open Access",
    "EID",
]
293
+
294
def _pick_link(row: pd.Series) -> str:
    """Return the first usable link of a row, or ``""`` when there is none.

    Candidates are tried in the order ``prismUrl``, ``link_scopus``,
    ``prism:url``, ``link_self``; a candidate is usable when it is present,
    not NA and not blank after stripping whitespace.
    """
    candidates = ("prismUrl", "link_scopus", "prism:url", "link_self")
    for name in candidates:
        if name not in row:
            continue
        value = row[name]
        if not pd.notna(value):
            continue
        text = str(value)
        if text.strip():
            return text
    return ""
299
+
300
def pick_col(df: pd.DataFrame, primary: str, secondary: str, default: str = "") -> pd.Series:
    """Row-wise fallback between two columns.

    For every row the ``primary`` value is used; when it is missing (``None``
    or NA) or blank after stripping, the ``secondary`` value is taken instead,
    and when that is missing too, ``default``. A column absent from ``df``
    counts as missing in every row.

    The result is assembled value by value into an object-dtype Series, so
    string fallbacks can sit next to numeric or boolean values without relying
    on implicit dtype upcasting during assignment.

    Returns:
        An object-dtype Series aligned with ``df.index``.
    """
    def values_of(name):
        if name in df.columns:
            return df[name].tolist()
        return [None] * len(df)

    def is_missing(value):
        return value is None or (pd.api.types.is_scalar(value) and pd.isna(value))

    def is_blank(value):
        return is_missing(value) or str(value).strip() == ""

    merged = []
    for first, second in zip(values_of(primary), values_of(secondary)):
        chosen = second if is_blank(first) else first
        merged.append(default if is_missing(chosen) else chosen)
    return pd.Series(merged, index=df.index, dtype=object)
309
+
310
def _split_page_range(pages) -> tuple:
    """Split a page range such as ``"e12-e20"`` into ``(start, end, count)`` strings.

    Only the digits of each side are kept. ``count`` is ``end - start + 1``
    (never below 0) and is ``""`` when either side has no digits. Values that
    are not strings, or that contain no ``"-"``, yield three empty strings.
    """
    if not isinstance(pages, str) or "-" not in pages:
        return "", "", ""
    first, last = pages.split("-", 1)
    start = "".join(ch for ch in first if ch.isdigit())
    end = "".join(ch for ch in last if ch.isdigit())
    count = ""
    if start and end:
        try:
            count = str(max(0, int(end) - int(start) + 1))
        except ValueError:
            # str.isdigit() accepts some characters that int() rejects.
            count = ""
    return start, end, count


def make_export(df: pd.DataFrame) -> pd.DataFrame:
    """Build the export table (columns ``EXPORT_COLUMNS``) from normalised entries.

    * ``Year`` is taken from ``coverDate`` and stored as nullable ``Int64`` so
      that rows without a date do not turn the whole column into floats
      (which would be written as ``2021.0``).
    * ``Page start``/``Page end``/``Page count`` are derived from ``pages``.
    * ``Link`` is chosen per row by ``_pick_link``.
    * Rows are sorted by year, newest first, undated rows last; rows of the
      same year keep their input order.

    The input frame is not modified.

    Returns:
        A new DataFrame with exactly the columns in ``EXPORT_COLUMNS``.
    """
    # Year derived from the cover date (all-missing when the column is absent).
    if "coverDate" in df.columns:
        year = pd.to_datetime(df["coverDate"], errors="coerce").dt.year
    else:
        year = pd.Series(float("nan"), index=df.index)

    # Page fields, one (start, end, count) triple per row.
    if "pages" in df.columns:
        page_parts = [_split_page_range(value) for value in df["pages"]]
    else:
        page_parts = [("", "", "")] * len(df)

    # Preferred link per row.
    links = [_pick_link(row) for _, row in df.iterrows()]

    # Sharing the source index lets scalar defaults broadcast to every row.
    final = pd.DataFrame(index=df.index)
    final["Title"] = df["title"] if "title" in df.columns else ""
    final["Year"] = pd.to_numeric(year, errors="coerce").astype("Int64")

    final["Source title"] = pick_col(df, "sourceTitle", "prism:publicationName")
    final["Volume"] = pick_col(df, "volume", "prism:volume")
    final["Issue"] = pick_col(df, "issue", "prism:issueIdentifier")

    final["Page start"] = [parts[0] for parts in page_parts]
    final["Page end"] = [parts[1] for parts in page_parts]
    final["Page count"] = [parts[2] for parts in page_parts]

    final["Cited by"] = pick_col(df, "citedBy", "citedby-count")
    final["DOI"] = pick_col(df, "doi", "prism:doi")
    final["Link"] = links

    final["ISSN"] = pick_col(df, "issn", "prism:issn")
    final["eISSN"] = pick_col(df, "eIssn", "prism:eIssn")

    final["Document Type"] = pick_col(df, "subtypeDesc", "subtypeDescription")
    final["Open Access"] = pick_col(df, "openAccess", "openaccessFlag")

    final["EID"] = df["eid"] if "eid" in df.columns else ""

    # Stable sort keeps the output deterministic for rows of the same year.
    final = final.sort_values(by="Year", ascending=False, na_position="last", kind="stable")

    # Reorder/filter to the published column set.
    return final[EXPORT_COLUMNS]
370
+
371
+ # -------------------------
372
+ # CLI
373
+ # -------------------------
374
def parse_args():
    """Parse the command-line options of the extractor.

    Year options are returned as given on the command line (strings, or the
    integer defaults) and may be the literal ``None``; cursor pagination is
    enabled unless ``--no-cursor`` is passed.

    Returns:
        The ``argparse.Namespace`` with the parsed options.
    """
    parser = argparse.ArgumentParser(
        description="Extrae publicaciones Scopus por AF-ID y exporta UN CSV básico (sin autores/abstract/etc.)."
    )

    # Credentials
    parser.add_argument("--api-key", required=True, help="X-ELS-APIKey")
    parser.add_argument("--insttoken", default=None, help="X-ELS-Insttoken (opcional)")

    # Query scope
    parser.add_argument("--afid", default="60077378", help="Scopus Affiliation ID (AF-ID)")
    parser.add_argument("--year-start", default=2020, help="Año inicial o 'None'")
    parser.add_argument("--year-end", default=2024, help="Año final o 'None'")

    # Search API options
    parser.add_argument(
        "--view",
        default="STANDARD",
        choices=["BASIC", "STANDARD", "COMPLETE"],
        help="Vista del Search API",
    )
    parser.add_argument("--page-size", type=int, default=100, help="Tamaño de página (25..200)")
    parser.add_argument("--use-cursor", action="store_true", help="Usar cursor pagination")
    parser.add_argument("--no-cursor", dest="use_cursor", action="store_false", help="Usar start/count")
    parser.set_defaults(use_cursor=True)

    # Output
    parser.add_argument("--out-prefix", default="scopus_afid", help="Prefijo de salida")

    return parser.parse_args()
388
+
389
def main():
    """Command-line entry point: download, normalise and export to one CSV.

    The output file is ``<out-prefix>_scopus_export.csv``, written as UTF-8
    with a BOM and without the index column.
    """
    args = parse_args()

    def to_year(raw):
        # The literal "none" (any case, surrounding blanks ignored) disables the bound.
        if str(raw).strip().lower() == "none":
            return None
        return int(raw)

    first_year = to_year(args.year_start)
    last_year = to_year(args.year_end)

    print("Descargando desde Scopus (Search API)…")
    entries = fetch_scopus_affiliation(
        api_key=args.api_key,
        afid=args.afid,
        year_start=first_year,
        year_end=last_year,
        view=args.view,
        page_size=args.page_size,
        insttoken=args.insttoken,
        use_cursor=args.use_cursor,
    )
    print(f"Entradas obtenidas: {len(entries)}")

    export_df = make_export(normalize_entries(entries))
    out_csv = f"{args.out_prefix}_scopus_export.csv"
    export_df.to_csv(out_csv, index=False, encoding="utf-8-sig")
    print(f"Listo: {out_csv}")


if __name__ == "__main__":
    main()
scopus_corpus.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:149ba9d029b6969dedcab02f95d6c7c77897fb7470581a3e437d545ea3af2530
3
+ size 1443176
scopus_corpus_with_specter.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7442d3f19ba7f2d7685506abf8030951ed1486628b5960ff21eea009f9b533c9
3
+ size 4668034