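"""Build a Parquet corpus with multilingual E5 embeddings for journal recommendation.

Reads a Scopus CSV export, builds a short matching text per record (article title
plus journal title), encodes it with a Sentence-Transformers E5 model, and writes
the rows together with their embeddings to a Parquet file.
"""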
import argparse

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from sentence_transformers import SentenceTransformer
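

# The Scopus export may omit some columns; ensure_col adds missing ones as empty strings.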
def ensure_col(df: pd.DataFrame, name: str):
    if name not in df.columns:
        df[name] = ""
    return df


def main():
    p = argparse.ArgumentParser(description="Builds a Parquet file with E5 embeddings for journal recommendation.")
    p.add_argument("--csv", required=True, help="Path to the exported CSV (e.g. uptc_afid60077378_scopus_export.csv)")
    p.add_argument("--out", default="scopus_corpus.parquet", help="Output Parquet path")
    p.add_argument("--model", default="intfloat/multilingual-e5-small", help="Sentence-Transformers model")
    p.add_argument("--batch-size", type=int, default=64, help="Batch size for encoding")
    args = p.parse_args()
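
    # Load the export and make sure every column referenced below exists.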
    df = pd.read_csv(args.csv)
    for c in ["Title", "Source title", "ISSN", "eISSN", "Year", "Cited by", "DOI",
              "Link", "EID", "Document Type", "Open Access"]:
        ensure_col(df, c)
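
    # Text used for semantic matching: the article title plus the journal name,
    # joined with a "Revista:" label and whitespace-normalized.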
df["text_for_match"] = (
|
|
|
df["Title"].fillna("").astype(str).str.strip()
|
|
|
+ ". Revista: "
|
|
|
+ df["Source title"].fillna("").astype(str).str.strip()
|
|
|
).str.replace(r"\s+", " ", regex=True).str.strip()
|
|
|
|
|
|
|
|
|
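
    # Encoding is forced onto CPU; the model can be swapped via --model.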
print(f"Cargando modelo: {args.model}")
|
|
|
model = SentenceTransformer(args.model, device="cpu")
|
|
|
|
|
|
|
|
|
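
    # E5-style models expect a "passage: " prefix on documents ("query: " on search queries).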
texts = ["passage: " + t if t else "passage: " for t in df["text_for_match"].tolist()]
|
|
|
print(f"Codificando {len(texts)} textos…")
|
|
|
embs = model.encode(
|
|
|
texts,
|
|
|
batch_size=args.batch_size,
|
|
|
show_progress_bar=True,
|
|
|
normalize_embeddings=True,
|
|
|
).astype(np.float32)
|
|
|
|
|
|
|
|
|
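
    # Coerce numeric columns: missing years stay null, missing citation counts become 0.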
    year = pd.to_numeric(df["Year"], errors="coerce").astype("Int64")
    cited = pd.to_numeric(df["Cited by"], errors="coerce").fillna(0).astype(np.int32)
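
    # Arrow table: metadata columns as strings, embeddings as list<float32>.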
    table = pa.table({
        "eid": pa.array(df["EID"].fillna("").astype(str).tolist()),
        "title": pa.array(df["Title"].fillna("").astype(str).tolist()),
        "source_title": pa.array(df["Source title"].fillna("").astype(str).tolist()),
        "issn": pa.array(df["ISSN"].fillna("").astype(str).tolist()),
        "eissn": pa.array(df["eISSN"].fillna("").astype(str).tolist()),
        "year": pa.array(year, type=pa.int64()),  # passing the Series keeps <NA> years as nulls
        "cited_by": pa.array(cited, type=pa.int32()),
        "doi": pa.array(df["DOI"].fillna("").astype(str).tolist()),
        "link": pa.array(df["Link"].fillna("").astype(str).tolist()),
        "Document Type": pa.array(df["Document Type"].fillna("").astype(str).tolist()),
        "Open Access": pa.array(df["Open Access"].fillna("").astype(str).tolist()),
        "text_for_match": pa.array(df["text_for_match"].tolist()),
        "embedding": pa.array(embs.tolist(), type=pa.list_(pa.float32())),
    })
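
    # Write the corpus to Parquet with zstd compression and report basic stats.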
    pq.write_table(table, args.out, compression="zstd")
    dim = len(embs[0]) if len(embs) else 0
    print(f"OK -> {args.out} | rows: {table.num_rows} | dim: {dim}")


if __name__ == "__main__":
    main()