import time
import argparse
import urllib.parse as urlparse
from typing import Dict, List, Optional

import requests
import numpy as np
import pandas as pd

BASE_URL_SEARCH = "https://api.elsevier.com/content/search/scopus"


def build_headers(api_key: str, insttoken: Optional[str] = None) -> Dict[str, str]:
    h = {"Accept": "application/json", "X-ELS-APIKey": api_key.strip()}
    if insttoken:
        h["X-ELS-Insttoken"] = insttoken.strip()
    return h


def get_json(session: requests.Session, url: str, params: Dict[str, str],
             headers: Dict[str, str], max_retries: int = 6, sleep_base: float = 0.75) -> Dict:
    """
    GET with exponential-backoff retries for 429/5xx responses. If a 401 is caused by an
    Insttoken that is not paired with the API key, retry once WITHOUT the Insttoken.
    """
    last_exc = None
    tried_without_token = False

    for t in range(max_retries + 1):
        try:
            r = session.get(url, params=params, headers=headers, timeout=90)
        except Exception as ex:
            last_exc = ex
            time.sleep((2 ** t) * sleep_base)
            continue

        if r.status_code in (429, 500, 502, 503, 504):
            time.sleep((2 ** t) * sleep_base)
            continue

        if r.status_code == 401:
            try:
                j = r.json()
            except Exception:
                j = {}
            if ("Institution Token is not associated with API Key" in str(j)
                    and not tried_without_token
                    and "X-ELS-Insttoken" in headers):
                tried_without_token = True
                h2 = dict(headers)
                h2.pop("X-ELS-Insttoken", None)
                r2 = session.get(url, params=params, headers=h2, timeout=90)
                if r2.ok:
                    try:
                        return r2.json()
                    except Exception:
                        raise RuntimeError("Response is not decodable JSON.")
                else:
                    try:
                        j2 = r2.json()
                    except Exception:
                        j2 = {}
                    raise RuntimeError(f"HTTP {r2.status_code} – {j2 or r2.text}")

        if not r.ok:
            try:
                j = r.json()
            except Exception:
                j = {}
            raise RuntimeError(f"HTTP {r.status_code} – {j or r.text}")

        try:
            return r.json()
        except Exception:
            raise RuntimeError("Response is not decodable JSON.")

    if last_exc:
        raise RuntimeError(f"Persistent network error: {last_exc}")
    raise RuntimeError("No stable response after several retries.")


def extract_by_year_cursor(session: requests.Session, headers: Dict[str, str],
                           afid: str, year: int, page_size: int, view: str) -> List[Dict]:
    params = {
        "query": f"AF-ID({afid}) AND PUBYEAR = {year}",
        "view": view,
        "count": str(page_size),
        "cursor": "*",
    }
    entries: List[Dict] = []
    while True:
        j = get_json(session, BASE_URL_SEARCH, params, headers)
        chunk = j.get("search-results", {}).get("entry", []) or []
        if chunk:
            entries.extend(chunk)

        next_token = None
        for ln in j.get("search-results", {}).get("link", []) or []:
            if ln.get("@ref") == "next":
                href = ln.get("@href")
                if href:
                    q = urlparse.urlparse(href).query
                    qd = urlparse.parse_qs(q)
                    next_token = (qd.get("cursor") or [None])[0]
                break
        if not next_token:
            break
        params["cursor"] = next_token
    return entries


def extract_by_year_startcount(session: requests.Session, headers: Dict[str, str],
                               afid: str, year: int, page_size: int, view: str,
                               hard_limit: int = 20000) -> List[Dict]:
    entries: List[Dict] = []
    start = 0
    while start < hard_limit:
        params = {
            "query": f"AF-ID({afid}) AND PUBYEAR = {year}",
            "view": view,
            "count": str(page_size),
            "start": str(start),
        }
        j = get_json(session, BASE_URL_SEARCH, params, headers)
        chunk = j.get("search-results", {}).get("entry", []) or []
        if not chunk:
            break
        entries.extend(chunk)
        if len(chunk) < page_size:
            break
        start += page_size
    return entries


def extract_no_year(session: requests.Session, headers: Dict[str, str],
                    afid: str, page_size: int, view: str, use_cursor: bool) -> List[Dict]:
    entries: List[Dict] = []
    if use_cursor:
        params = {"query": f"AF-ID({afid})", "view": view, "count": str(page_size), "cursor": "*"}
        while True:
            j = get_json(session, BASE_URL_SEARCH, params, headers)
            chunk = j.get("search-results", {}).get("entry", []) or []
            if chunk:
                entries.extend(chunk)
            next_token = None
            for ln in j.get("search-results", {}).get("link", []) or []:
                if ln.get("@ref") == "next":
                    href = ln.get("@href")
                    if href:
                        q = urlparse.urlparse(href).query
                        qd = urlparse.parse_qs(q)
                        next_token = (qd.get("cursor") or [None])[0]
                    break
            if not next_token:
                break
            params["cursor"] = next_token
    else:
        start = 0
        while True:
            params_sc = {"query": f"AF-ID({afid})", "view": view, "count": str(page_size), "start": str(start)}
            j = get_json(session, BASE_URL_SEARCH, params_sc, headers)
            chunk = j.get("search-results", {}).get("entry", []) or []
            if not chunk:
                break
            entries.extend(chunk)
            if len(chunk) < page_size:
                break
            start += page_size
    return entries


TOP_FIELD_MAP = {
    "dc:title": "title",
    "prism:coverDate": "coverDate",
    "prism:doi": "doi",
    "prism:publicationName": "sourceTitle",
    "prism:issn": "issn",
    "prism:eIssn": "eIssn",
    "prism:volume": "volume",
    "prism:issueIdentifier": "issue",
    "prism:pageRange": "pages",
    "citedby-count": "citedBy",
    "subtype": "subtype",
    "subtypeDescription": "subtypeDesc",
    "openaccessFlag": "openAccess",
    "dc:identifier": "identifier",
    "eid": "eid",
    "prism:url": "prismUrl",
}


def links_to_dict(links: List[Dict]) -> Dict[str, str]:
    d = {}
    for ln in links or []:
        ref = ln.get("@ref")
        href = ln.get("@href")
        if ref and href:
            d[f"link_{ref}"] = href
    return d


def normalize_entries(entries: List[Dict]) -> pd.DataFrame:
    rows: List[Dict] = []
    for e in entries:
        row = {}
        for k_src, k_dst in TOP_FIELD_MAP.items():
            if k_src in e:
                row[k_dst] = e.get(k_src)
        row.update(links_to_dict(e.get("link")))
        rows.append(row)

    df = pd.DataFrame(rows)
    if not df.empty:
        if "coverDate" in df.columns:
            df["coverDate"] = pd.to_datetime(df["coverDate"], errors="coerce")
        subset_cols = [c for c in ["eid", "identifier"] if c in df.columns]
        if subset_cols:
            df = df.drop_duplicates(subset=subset_cols, keep="first")
    return df


def try_extract_year(session, headers, afid, year, page_size, view, use_cursor) -> List[Dict]:
    def do_extract(ps, cur, v):
        if cur:
            return extract_by_year_cursor(session, headers, afid, year, ps, v)
        else:
            return extract_by_year_startcount(session, headers, afid, year, ps, v)

    try:
        return do_extract(page_size, use_cursor, view)
    except RuntimeError as e:
        msg = str(e)
        if "AUTHORIZATION_ERROR" in msg:
            # Entitlement too low for the requested view: step down one level and retry.
            fallback = "STANDARD" if view == "COMPLETE" else ("BASIC" if view == "STANDARD" else None)
            if fallback:
                return do_extract(page_size, use_cursor, fallback)
            raise
        if "INVALID_INPUT" in msg and "maximum number allowed for the service level" in msg:
            # Page size exceeds the service level: retry with the minimum (25) and start/count paging.
            return do_extract(25, False, view)
        if use_cursor:
            return do_extract(page_size, False, view)
        raise


def try_extract_no_year(session, headers, afid, page_size, view, use_cursor) -> List[Dict]:
    try:
        return extract_no_year(session, headers, afid, page_size, view, use_cursor)
    except RuntimeError as e:
        msg = str(e)
        if "AUTHORIZATION_ERROR" in msg:
            if view == "COMPLETE":
                return extract_no_year(session, headers, afid, page_size, "STANDARD", use_cursor)
            if view == "STANDARD":
                return extract_no_year(session, headers, afid, page_size, "BASIC", use_cursor)
            raise
        if "INVALID_INPUT" in msg and "maximum number allowed for the service level" in msg:
            return extract_no_year(session, headers, afid, 25, view, False)
        if use_cursor:
            return extract_no_year(session, headers, afid, page_size, view, False)
        raise


def fetch_scopus_affiliation(api_key: str,
                             afid: str = "60077378",
                             year_start: Optional[int] = 2020,
                             year_end: Optional[int] = 2024,
                             view: str = "STANDARD",
                             page_size: int = 100,
                             insttoken: Optional[str] = None,
                             use_cursor: bool = True) -> List[Dict]:
    headers = build_headers(api_key, insttoken)
    session = requests.Session()
    if year_start is None or year_end is None:
        return try_extract_no_year(session, headers, afid, page_size, view, use_cursor)
    entries: List[Dict] = []
    for yr in range(int(year_start), int(year_end) + 1):
        entries.extend(try_extract_year(session, headers, afid, yr, page_size, view, use_cursor))
    return entries
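
# Example (illustrative only, not part of the CLI flow): the fetch/normalize/export
# pipeline used programmatically. "YOUR_API_KEY" is a placeholder; the AF-ID and
# years are sample values.
#   entries = fetch_scopus_affiliation(api_key="YOUR_API_KEY", afid="60077378",
#                                      year_start=2022, year_end=2023)
#   df = normalize_entries(entries)
#   make_export(df).to_csv("scopus_subset.csv", index=False, encoding="utf-8-sig")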


EXPORT_COLUMNS = [
    "Title", "Year", "Source title", "Volume", "Issue",
    "Page start", "Page end", "Page count",
    "Cited by", "DOI", "Link", "ISSN", "eISSN", "Document Type", "Open Access", "EID",
]


def _pick_link(row: pd.Series) -> str:
    for c in ("prismUrl", "link_scopus", "prism:url", "link_self"):
        if c in row and pd.notna(row[c]) and str(row[c]).strip():
            return str(row[c])
    return ""


def pick_col(df: pd.DataFrame, primary: str, secondary: str, default: str = "") -> pd.Series:
    """Row-wise fallback: use primary; where it is empty/NaN, take secondary."""
    n = len(df)
    s1 = df[primary] if primary in df.columns else pd.Series([np.nan] * n, index=df.index)
    s2 = df[secondary] if secondary in df.columns else pd.Series([default] * n, index=df.index)
    s1 = s1.copy()
    mask = s1.isna() | (s1.astype(str).str.strip() == "")
    s1.loc[mask] = s2.loc[mask]
    return s1.fillna(default)


def make_export(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()

    # Year from coverDate
    if "coverDate" in out.columns:
        out["Year"] = pd.to_datetime(out["coverDate"], errors="coerce").dt.year
    else:
        out["Year"] = ""

    # Page start / end / count from the "pages" range (e.g. "123-130")
    out["Page start"], out["Page end"], out["Page count"] = "", "", ""
    if "pages" in out.columns:
        starts, ends, counts = [], [], []
        for x in out["pages"].fillna("").astype(str):
            if "-" in x:
                a, b = x.split("-", 1)
                a_num = "".join(ch for ch in a if ch.isdigit())
                b_num = "".join(ch for ch in b if ch.isdigit())
                starts.append(a_num); ends.append(b_num)
                try:
                    counts.append(str(max(0, int(b_num) - int(a_num) + 1)) if a_num and b_num else "")
                except Exception:
                    counts.append("")
            else:
                starts.append(""); ends.append(""); counts.append("")
        out["Page start"], out["Page end"], out["Page count"] = starts, ends, counts

    # Best available link for each record
    out["Link"] = out.apply(_pick_link, axis=1)

    # Assemble the export frame
    final = pd.DataFrame()
    final["Title"] = out.get("title", "")
    final["Year"] = out.get("Year", "")

    final["Source title"] = pick_col(out, "sourceTitle", "prism:publicationName")
    final["Volume"] = pick_col(out, "volume", "prism:volume")
    final["Issue"] = pick_col(out, "issue", "prism:issueIdentifier")

    final["Page start"] = out["Page start"]
    final["Page end"] = out["Page end"]
    final["Page count"] = out["Page count"]

    final["Cited by"] = pick_col(out, "citedBy", "citedby-count")
    final["DOI"] = pick_col(out, "doi", "prism:doi")
    final["Link"] = out["Link"]

    final["ISSN"] = pick_col(out, "issn", "prism:issn")
    final["eISSN"] = pick_col(out, "eIssn", "prism:eIssn")

    final["Document Type"] = pick_col(out, "subtypeDesc", "subtypeDescription")
    final["Open Access"] = pick_col(out, "openAccess", "openaccessFlag")

    final["EID"] = out.get("eid", "")

    # Sort newest first
    final["Year"] = pd.to_numeric(final["Year"], errors="coerce")
    final = final.sort_values(by="Year", ascending=False, na_position="last")

    return final[EXPORT_COLUMNS]


def parse_args():
    p = argparse.ArgumentParser(
        description="Extracts Scopus publications by AF-ID and exports ONE basic CSV (no authors/abstract/etc.).")
    p.add_argument("--api-key", required=True, help="X-ELS-APIKey")
    p.add_argument("--insttoken", default=None, help="X-ELS-Insttoken (optional)")
    p.add_argument("--afid", default="60077378", help="Scopus Affiliation ID (AF-ID)")
    p.add_argument("--year-start", default=2020, help="Start year, or 'None'")
    p.add_argument("--year-end", default=2024, help="End year, or 'None'")
    p.add_argument("--view", default="STANDARD", choices=["BASIC", "STANDARD", "COMPLETE"], help="Search API view")
    p.add_argument("--page-size", type=int, default=100, help="Page size (25..200)")
    p.add_argument("--use-cursor", action="store_true", help="Use cursor pagination")
    p.add_argument("--no-cursor", dest="use_cursor", action="store_false", help="Use start/count pagination")
    p.set_defaults(use_cursor=True)
    p.add_argument("--out-prefix", default="scopus_afid", help="Output prefix")
    return p.parse_args()
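
# Typical invocation (illustrative; <this_script>.py and the API key are placeholders,
# the flags are the ones defined above):
#   python <this_script>.py --api-key YOUR_API_KEY --afid 60077378 \
#       --year-start 2020 --year-end 2024 --view STANDARD --out-prefix scopus_afid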


def main():
    args = parse_args()

    def norm_year(x):
        sx = str(x).strip().lower()
        return None if sx == "none" else int(x)

    y0 = norm_year(args.year_start)
    y1 = norm_year(args.year_end)

    print("Downloading from Scopus (Search API)…")
    entries = fetch_scopus_affiliation(
        api_key=args.api_key,
        afid=args.afid,
        year_start=y0,
        year_end=y1,
        view=args.view,
        page_size=args.page_size,
        insttoken=args.insttoken,
        use_cursor=args.use_cursor
    )
    print(f"Entries retrieved: {len(entries)}")

    df = normalize_entries(entries)
    export_df = make_export(df)
    out_csv = f"{args.out_prefix}_scopus_export.csv"
    export_df.to_csv(out_csv, index=False, encoding="utf-8-sig")
    print(f"Done: {out_csv}")


if __name__ == "__main__":
    main()