cyberosa committed
Commit 99c38a1
Parent(s): 4d28ca6

cleaning, new notebooks and two months data logic

Files changed:
- app.py +105 -110
- notebooks/confidence_analysis.ipynb +0 -0
- scripts/get_mech_info.py +89 -0
- scripts/profitability.py +16 -16
- scripts/pull_data.py +39 -22
- scripts/tools.py +48 -136
- scripts/utils.py +110 -0
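The "two months data logic" in this commit narrows both the tools and the trades data to a rolling 60-day window. As orientation only, a minimal sketch of that cutoff computation (illustrative, not part of the commit; it mirrors the names introduced in scripts/get_mech_info.py below):

from datetime import datetime, timedelta, UTC

# Rolling 60-day cutoff (datetime.UTC requires Python 3.11+)
DATETIME_60_DAYS_AGO = datetime.now(UTC) - timedelta(days=60)
cutoff_date = DATETIME_60_DAYS_AGO.strftime("%Y-%m-%d")   # date string for parquet/SQL filters
cutoff_timestamp = int(DATETIME_60_DAYS_AGO.timestamp())  # unix timestamp for subgraph block lookups
print(cutoff_date, cutoff_timestamp)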
app.py
CHANGED

@@ -4,25 +4,25 @@ import pandas as pd
 import duckdb
 import logging
 from tabs.trades import (
-    prepare_trades,
-    get_overall_trades,
+    prepare_trades,
+    get_overall_trades,
     get_overall_winning_trades,
     plot_trades_by_week,
     plot_winning_trades_by_week,
-    plot_trade_details
+    plot_trade_details,
 )
 from tabs.tool_win import (
     get_tool_winning_rate,
     get_overall_winning_rate,
     plot_tool_winnings_overall,
-    plot_tool_winnings_by_tool
+    plot_tool_winnings_by_tool,
 )
 from tabs.error import (
-    get_error_data,
+    get_error_data,
     get_error_data_overall,
     plot_error_data,
     plot_tool_error_data,
-    plot_week_error_data
+    plot_week_error_data,
 )
 from tabs.about import about_olas_predict

@@ -33,21 +33,25 @@ def get_logger():
     # stream handler and formatter
     stream_handler = logging.StreamHandler()
     stream_handler.setLevel(logging.DEBUG)
-    formatter = logging.Formatter(
+    formatter = logging.Formatter(
+        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    )
     stream_handler.setFormatter(formatter)
     logger.addHandler(stream_handler)
     return logger

+
 logger = get_logger()

+
 def get_last_one_month_data():
     """
     Get the last one month data from the tools.parquet file
     """
     logger.info("Getting last one month data")
-    con = duckdb.connect(
-    one_months_ago = (datetime.now() - timedelta(days=60)).strftime(
+    con = duckdb.connect(":memory:")
+    one_months_ago = (datetime.now() - timedelta(days=60)).strftime("%Y-%m-%d")
+
     # Query to fetch data from all_trades_profitability.parquet
     query2 = f"""
         SELECT *

@@ -69,19 +73,47 @@ def get_last_one_month_data():

     return df1, df2

+
+def get_all_data():
+    """
+    Get all data from the tools.parquet and all_trades_profitability.parquet files
+    """
+    logger.info("Getting all data")
+    con = duckdb.connect(":memory:")
+
+    # Query to fetch data from all_trades_profitability.parquet
+    query2 = f"""
+        SELECT *
+        FROM read_parquet('./data/all_trades_profitability.parquet')
+    """
+    df2 = con.execute(query2).fetchdf()
+    logger.info("Got all data from all_trades_profitability.parquet")
+
+    query1 = f"""
+        SELECT *
+        FROM read_parquet('./data/tools.parquet')
+    """
+    df1 = con.execute(query1).fetchdf()
+    logger.info("Got all data from tools.parquet")
+
+    con.close()
+
+    return df1, df2
+
+
 def prepare_data():
     """
     Prepare the data for the dashboard
     """
-    tools_df, trades_df =
+    tools_df, trades_df = get_all_data()

-    tools_df[
-    trades_df[
+    tools_df["request_time"] = pd.to_datetime(tools_df["request_time"])
+    trades_df["creation_timestamp"] = pd.to_datetime(trades_df["creation_timestamp"])

     trades_df = prepare_trades(trades_df)
     return tools_df, trades_df

+
 tools_df, trades_df = prepare_data()


@@ -89,53 +121,39 @@ demo = gr.Blocks()


 INC_TOOLS = [
+    "prediction-online",
+    "prediction-offline",
+    "claude-prediction-online",
+    "claude-prediction-offline",
+    "prediction-offline-sme",
+    "prediction-online-sme",
+    "prediction-request-rag",
+    "prediction-request-reasoning",
+    "prediction-url-cot-claude",
+    "prediction-request-rag-claude",
+    "prediction-request-reasoning-claude",
 ]


-error_df = get_error_data(
-winning_rate_df = get_tool_winning_rate(
-    tools_df=tools_df,
-    inc_tools=INC_TOOLS
-)
-winning_rate_overall_df = get_overall_winning_rate(
-    wins_df=winning_rate_df
-)
-trades_count_df = get_overall_trades(
-    trades_df=trades_df
-)
-trades_winning_rate_df = get_overall_winning_trades(
-    trades_df=trades_df
-)
+error_df = get_error_data(tools_df=tools_df, inc_tools=INC_TOOLS)
+error_overall_df = get_error_data_overall(error_df=error_df)
+winning_rate_df = get_tool_winning_rate(tools_df=tools_df, inc_tools=INC_TOOLS)
+winning_rate_overall_df = get_overall_winning_rate(wins_df=winning_rate_df)
+trades_count_df = get_overall_trades(trades_df=trades_df)
+trades_winning_rate_df = get_overall_winning_trades(trades_df=trades_df)

 with demo:
     gr.HTML("<h1>Olas Predict Actual Performance</h1>")
-    gr.Markdown(
+    gr.Markdown(
+        "This app shows the actual performance of Olas Predict tools on the live market."
+    )

     with gr.Tabs():
         with gr.TabItem("🔥Trades Dashboard"):
             with gr.Row():
                 gr.Markdown("# Plot of number of trades by week")
             with gr.Row():
-                trades_by_week_plot = plot_trades_by_week(
-                    trades_df=trades_count_df
-                )
+                trades_by_week_plot = plot_trades_by_week(trades_df=trades_count_df)
             with gr.Row():
                 gr.Markdown("# Plot of winning trades by week")
             with gr.Row():

@@ -146,32 +164,30 @@ with demo:
                 gr.Markdown("# Plot of trade details")
             with gr.Row():
                 trade_details_selector = gr.Dropdown(
-                    label="Select a trade",
+                    label="Select a trade",
                     choices=[
                         "mech calls",
                         "collateral amount",
                         "earnings",
                         "net earnings",
-                        "ROI"
+                        "ROI",
                     ],
-                    value="mech calls"
+                    value="mech calls",
                 )
             with gr.Row():
                 trade_details_plot = plot_trade_details(
-                    trade_detail="mech calls",
-                    trades_df=trades_df
+                    trade_detail="mech calls", trades_df=trades_df
                 )

             def update_trade_details(trade_detail):
                 return plot_trade_details(
-                    trade_detail=trade_detail,
-                    trades_df=trades_df
+                    trade_detail=trade_detail, trades_df=trades_df
                 )

             trade_details_selector.change(
-                update_trade_details,
-                inputs=trade_details_selector,
-                outputs=trade_details_plot
+                update_trade_details,
+                inputs=trade_details_selector,
+                outputs=trade_details_plot,
             )

             with gr.Row():

@@ -185,27 +201,25 @@ with demo:

             with gr.Row():
                 winning_selector = gr.Dropdown(
-                    label="Select Metric",
-                    choices=[
-                    value=
+                    label="Select Metric",
+                    choices=["losses", "wins", "total_request", "win_perc"],
+                    value="win_perc",
                 )

             with gr.Row():
                 winning_plot = plot_tool_winnings_overall(
-                    wins_df=winning_rate_overall_df,
-                    winning_selector="win_perc"
+                    wins_df=winning_rate_overall_df, winning_selector="win_perc"
                 )

             def update_tool_winnings_overall_plot(winning_selector):
                 return plot_tool_winnings_overall(
-                    wins_df=winning_rate_overall_df,
-                    winning_selector=winning_selector
+                    wins_df=winning_rate_overall_df, winning_selector=winning_selector
                 )

             winning_selector.change(
                 update_tool_winnings_overall_plot,
-                inputs=winning_selector,
-                outputs=winning_plot
+                inputs=winning_selector,
+                outputs=winning_plot,
             )

             with gr.Row():

@@ -215,30 +229,24 @@ with demo:

             with gr.Row():
                 gr.Markdown("# Plot showing winning rate by tool")

             with gr.Row():
                 sel_tool = gr.Dropdown(
-                    label="Select a tool",
-                    choices=INC_TOOLS,
-                    value=INC_TOOLS[0]
+                    label="Select a tool", choices=INC_TOOLS, value=INC_TOOLS[0]
                 )

             with gr.Row():
                 tool_winnings_by_tool_plot = plot_tool_winnings_by_tool(
-                    wins_df=winning_rate_df,
-                    tool=INC_TOOLS[0]
+                    wins_df=winning_rate_df, tool=INC_TOOLS[0]
                 )

             def update_tool_winnings_by_tool_plot(tool):
-                return plot_tool_winnings_by_tool(
-                    wins_df=winning_rate_df,
-                    tool=tool
-                )
+                return plot_tool_winnings_by_tool(wins_df=winning_rate_df, tool=tool)

             sel_tool.change(
                 update_tool_winnings_by_tool_plot,
-                inputs=sel_tool,
-                outputs=tool_winnings_by_tool_plot
+                inputs=sel_tool,
+                outputs=tool_winnings_by_tool_plot,
             )

             with gr.Row():

@@ -250,35 +258,24 @@ with demo:
             with gr.Row():
                 gr.Markdown("# Plot showing overall error")
             with gr.Row():
-                error_overall_plot = plot_error_data(
-                    error_all_df=error_overall_df
-                )
+                error_overall_plot = plot_error_data(error_all_df=error_overall_df)
             with gr.Row():
                 gr.Markdown("# Plot showing error by tool")
             with gr.Row():
                 sel_tool = gr.Dropdown(
-                    label="Select a tool",
-                    choices=INC_TOOLS,
-                    value=INC_TOOLS[0]
+                    label="Select a tool", choices=INC_TOOLS, value=INC_TOOLS[0]
                 )

             with gr.Row():
                 tool_error_plot = plot_tool_error_data(
-                    error_df=error_df,
-                    tool=INC_TOOLS[0]
+                    error_df=error_df, tool=INC_TOOLS[0]
                 )

             def update_tool_error_plot(tool):
-                return plot_tool_error_data(
-                    error_df=error_df,
-                    tool=tool
-                )
+                return plot_tool_error_data(error_df=error_df, tool=tool)

             sel_tool.change(
-                update_tool_error_plot,
-                inputs=sel_tool,
-                outputs=tool_error_plot
+                update_tool_error_plot, inputs=sel_tool, outputs=tool_error_plot
             )
             with gr.Row():
                 sel_tool

@@ -289,29 +286,27 @@ with demo:
             gr.Markdown("# Plot showing error by week")

             with gr.Row():
-                choices = error_overall_df[
+                choices = error_overall_df["request_month_year_week"].unique().tolist()
                 # sort the choices by the latest week to be on the top
                 choices = sorted(choices)
                 sel_week = gr.Dropdown(
-                    label="Select a week",
-                    value=choices[-1]
-                )
+                    label="Select a week", choices=choices, value=choices[-1]
+                )

             with gr.Row():
                 week_error_plot = plot_week_error_data(
-                    error_df=error_df,
-                    week=choices[-1]
+                    error_df=error_df, week=choices[-1]
                 )

             def update_week_error_plot(selected_week):
-                return plot_week_error_data(
-                    error_df=error_df,
-                    week=selected_week
-                )
+                return plot_week_error_data(error_df=error_df, week=selected_week)

-            sel_tool.change(
+            sel_tool.change(
+                update_tool_error_plot, inputs=sel_tool, outputs=tool_error_plot
+            )
+            sel_week.change(
+                update_week_error_plot, inputs=sel_week, outputs=week_error_plot
+            )

             with gr.Row():
                 sel_tool
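get_last_one_month_data() and get_all_data() above both follow the same DuckDB pattern: open an in-memory connection, scan a parquet file with read_parquet, and pull the result into pandas with fetchdf(). A minimal standalone sketch (the file path and date filter are placeholders, not taken from the commit):

import duckdb

con = duckdb.connect(":memory:")
df = con.execute(
    """
    SELECT *
    FROM read_parquet('./data/tools.parquet')
    WHERE request_time >= '2024-03-01'  -- placeholder cutoff date
    """
).fetchdf()
con.close()
print(df.shape)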
notebooks/confidence_analysis.ipynb
ADDED
The diff for this file is too large to render. See raw diff.
scripts/get_mech_info.py
ADDED

@@ -0,0 +1,89 @@
+from dataclasses import dataclass
+from string import Template
+from typing import Any
+from datetime import datetime, timedelta, UTC
+import requests
+
+MECH_SUBGRAPH_URL = "https://api.thegraph.com/subgraphs/name/stakewise/ethereum-gnosis"
+SUBGRAPH_HEADERS = {
+    "Accept": "application/json, multipart/mixed",
+    "Content-Type": "application/json",
+}
+QUERY_BATCH_SIZE = 1000
+DATETIME_60_DAYS_AGO = datetime.now(UTC) - timedelta(days=60)
+BLOCK_NUMBER = Template(
+    """
+    {
+        blocks(
+            first: 1,
+            orderBy: timestamp,
+            orderDirection: asc,
+            where: {
+                timestamp_gte: "${timestamp_from}",
+                timestamp_lte: "${timestamp_to}"
+            }
+        ){
+            id
+        }
+    }
+    """
+)
+
+
+def fetch_block_number(timestamp_from: int, timestamp_to: int) -> dict:
+    """Get a block number by its timestamp margins."""
+
+    query = BLOCK_NUMBER.substitute(
+        timestamp_from=timestamp_from, timestamp_to=timestamp_to
+    )
+    # print(f"Sending query for the subgraph = {query}")
+
+    response = requests.post(
+        MECH_SUBGRAPH_URL,
+        headers=SUBGRAPH_HEADERS,
+        json={"query": query},
+        timeout=300,
+    )
+
+    result_json = response.json()
+    print(f"Response of the query={result_json}")
+    blocks = result_json.get("data", {}).get("blocks", "")
+    return blocks[0]
+
+
+def get_mech_info_last_60_days() -> dict[str, Any]:
+    """Query the subgraph to get the last 60 days of information from mech."""
+
+    timestamp_60_days_ago = int((DATETIME_60_DAYS_AGO).timestamp())
+    margin = timedelta(seconds=5)
+    timestamp_60_days_ago_plus_margin = int((DATETIME_60_DAYS_AGO + margin).timestamp())
+
+    last_month_block_number = fetch_block_number(
+        timestamp_60_days_ago, timestamp_60_days_ago_plus_margin
+    )
+    # expecting only one block
+    last_month_block_number = last_month_block_number.get("id", "")
+    if last_month_block_number.isdigit():
+        last_month_block_number = int(last_month_block_number)
+
+    if last_month_block_number == "":
+        raise ValueError("Could not find a valid block number for last month data")
+
+    MECH_TO_INFO = {
+        # this block number is when the creator had its first tx ever, and after this mech's creation
+        "0xff82123dfb52ab75c417195c5fdb87630145ae81": (
+            "old_mech_abi.json",
+            last_month_block_number,
+        ),
+        # this block number is when this mech was created
+        "0x77af31de935740567cf4ff1986d04b2c964a786a": (
+            "new_mech_abi.json",
+            last_month_block_number,
+        ),
+    }
+    return MECH_TO_INFO
+
+
+if __name__ == "__main__":
+    result = get_mech_info_last_60_days()
+    print(result)
scripts/profitability.py
CHANGED

@@ -28,6 +28,7 @@ from enum import Enum
 from tqdm import tqdm
 import numpy as np
 from pathlib import Path
+from get_mech_info import DATETIME_60_DAYS_AGO

 IRRELEVANT_TOOLS = [
     "openai-text-davinci-002",

@@ -59,6 +60,7 @@ SCRIPTS_DIR = Path(__file__).parent
 ROOT_DIR = SCRIPTS_DIR.parent
 DATA_DIR = ROOT_DIR / "data"

+
 class MarketState(Enum):
     """Market state"""

@@ -343,7 +345,6 @@ def wei_to_unit(wei: int) -> float:
 def _is_redeemed(user_json: dict[str, Any], fpmmTrade: dict[str, Any]) -> bool:
     """Returns whether the user has redeemed the position."""
     user_positions = user_json["data"]["user"]["userPositions"]
-    outcomes_tokens_traded = int(fpmmTrade["outcomeTokensTraded"])
     condition_id = fpmmTrade["fpmm.condition.id"]

     for position in user_positions:

@@ -358,12 +359,12 @@ def _is_redeemed(user_json: dict[str, Any], fpmmTrade: dict[str, Any]) -> bool:
     return False


-def create_fpmmTrades(rpc: str):
+def create_fpmmTrades(rpc: str, from_timestamp: float = DEFAULT_FROM_TIMESTAMP):
     """Create fpmmTrades for all trades."""
     trades_json = _query_omen_xdai_subgraph(
-        from_timestamp=
+        from_timestamp=from_timestamp,
         to_timestamp=DEFAULT_TO_TIMESTAMP,
-        fpmm_from_timestamp=
+        fpmm_from_timestamp=from_timestamp,
         fpmm_to_timestamp=DEFAULT_TO_TIMESTAMP,
     )

@@ -384,18 +385,14 @@ def create_fpmmTrades(rpc: str):
     # change creator to creator_address
     df.rename(columns={"creator": "trader_address"}, inplace=True)

-    # save to csv
-    df.to_parquet(DATA_DIR / "fpmmTrades.parquet", index=False)
-
     return df


 def prepare_profitalibity_data(rpc: str):
     """Prepare data for profitalibity analysis."""

-    # Check if tools.
+    # Check if tools.parquet is in the same directory
     try:
-        # load tools.csv
         tools = pd.read_parquet(DATA_DIR / "tools.parquet")

         # make sure creator_address is in the columns

@@ -412,16 +409,18 @@ def prepare_profitalibity_data(rpc: str):
         print("tools.parquet not found. Please run tools.py first.")
         return

-    # Check if fpmmTrades.
+    # Check if fpmmTrades.parquet is in the same directory
     try:
-        # load fpmmTrades.csv
         fpmmTrades = pd.read_parquet(DATA_DIR / "fpmmTrades.parquet")
         print("fpmmTrades.parquet loaded")
     except FileNotFoundError:
         print("fpmmTrades.parquet not found. Creating fpmmTrades.parquet...")
+        # Prepare the same time window as used for the tools
+        timestamp_60_days_ago = (DATETIME_60_DAYS_AGO).timestamp()
+        fpmmTrades = create_fpmmTrades(rpc, from_timestamp=timestamp_60_days_ago)
         fpmmTrades.to_parquet(DATA_DIR / "fpmmTrades.parquet", index=False)
+        # This is not needed
+        # fpmmTrades = pd.read_parquet(DATA_DIR / "fpmmTrades.parquet")

     # make sure trader_address is in the columns
     assert "trader_address" in fpmmTrades.columns, "trader_address column not found"

@@ -468,7 +467,7 @@ def analyse_trader(
     # Iterate over the trades
     for i, trade in tqdm(trades.iterrows(), total=len(trades), desc="Analysing trades"):
         try:
-            if not trade[
+            if not trade["fpmm.currentAnswer"]:
                 print(f"Skipping trade {i} because currentAnswer is NaN")
                 continue
             # Parsing and computing shared values

@@ -535,7 +534,8 @@ def analyse_trader(
                 "num_mech_calls": num_mech_calls,
                 "mech_fee_amount": num_mech_calls * DEFAULT_MECH_FEE,
                 "net_earnings": net_earnings,
-                "roi": net_earnings
+                "roi": net_earnings
+                / (collateral_amount + fee_amount + num_mech_calls * DEFAULT_MECH_FEE),
             }

         except Exception as e:

@@ -613,7 +613,7 @@ def run_profitability_analysis(rpc):
     # load dfs from csv for analysis
     print("Preparing data...")
     fpmmTrades, tools = prepare_profitalibity_data(rpc)
-    tools[
+    tools["trader_address"] = tools["trader_address"].str.lower()

     # all trades profitability df
     print("Analysing trades...")
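The new "roi" field divides net earnings by the full cost basis: collateral plus market fee plus mech-call fees. A worked example with made-up numbers (the DEFAULT_MECH_FEE value here is an assumption for illustration, not taken from the commit):

DEFAULT_MECH_FEE = 0.01   # assumed fee per mech call, illustration only

collateral_amount = 1.0   # amount put into the position
fee_amount = 0.02         # market fee paid on the trade
num_mech_calls = 2
net_earnings = 0.5

roi = net_earnings / (collateral_amount + fee_amount + num_mech_calls * DEFAULT_MECH_FEE)
print(round(roi, 3))  # 0.5 / 1.04 -> approximately 0.481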
scripts/pull_data.py
CHANGED

@@ -19,6 +19,7 @@ from tools import (
     DEFAULT_FILENAME as TOOLS_FILENAME,
 )
 from profitability import run_profitability_analysis
+
 import gc

 logging.basicConfig(level=logging.INFO)

@@ -27,6 +28,7 @@ SCRIPTS_DIR = Path(__file__).parent
 ROOT_DIR = SCRIPTS_DIR.parent
 DATA_DIR = ROOT_DIR / "data"

+
 def get_question(text: str) -> str:
     """Get the question from a text."""
     # Regex to find text within double quotes

@@ -43,24 +45,26 @@ def get_question(text: str) -> str:

 def current_answer(text: str, fpmms: pd.DataFrame) -> Optional[str]:
     """Get the current answer for a question."""
-    row = fpmms[fpmms[
+    row = fpmms[fpmms["title"] == text]
     if row.shape[0] == 0:
         return None
-    return row[
+    return row["currentAnswer"].values[0]


 def block_number_to_timestamp(block_number: int, web3: Web3) -> str:
     """Convert a block number to a timestamp."""
     block = web3.eth.get_block(block_number)
-    timestamp = datetime.utcfromtimestamp(block[
-    return timestamp.strftime(
+    timestamp = datetime.utcfromtimestamp(block["timestamp"])
+    return timestamp.strftime("%Y-%m-%d %H:%M:%S")


 def parallelize_timestamp_conversion(df: pd.DataFrame, function: callable) -> list:
     """Parallelize the timestamp conversion."""
-    block_numbers = df[
+    block_numbers = df["request_block"].tolist()
     with ThreadPoolExecutor(max_workers=10) as executor:
-        results = list(
+        results = list(
+            tqdm(executor.map(function, block_numbers), total=len(block_numbers))
+        )
     return results


@@ -76,10 +80,11 @@ def weekly_analysis():

     # Run tools ETL
     logging.info("Running tools ETL")
+
+    # This etl is saving already the tools parquet file
     tools_etl(
         rpcs=[rpc],
         filename=TOOLS_FILENAME,
-        full_contents=True,
     )
     logging.info("Tools ETL completed")

@@ -98,35 +103,48 @@ def weekly_analysis():

     # Get the question from the tools
     logging.info("Getting the question and current answer for the tools")
-    tools[
-    tools[
+    tools["title"] = tools["prompt_request"].apply(lambda x: get_question(x))
+    tools["currentAnswer"] = tools["title"].apply(lambda x: current_answer(x, fpmms))

-    tools[
-    tools[
+    tools["currentAnswer"] = tools["currentAnswer"].str.replace("yes", "Yes")
+    tools["currentAnswer"] = tools["currentAnswer"].str.replace("no", "No")

     # Convert block number to timestamp
     logging.info("Converting block number to timestamp")
     t_map = pickle.load(open(DATA_DIR / "t_map.pkl", "rb"))
-    tools[
+    tools["request_time"] = tools["request_block"].map(t_map)

     # Identify tools with missing request_time and fill them
-    missing_time_indices = tools[tools[
+    missing_time_indices = tools[tools["request_time"].isna()].index
     if not missing_time_indices.empty:
-        partial_block_number_to_timestamp = partial(
+        partial_block_number_to_timestamp = partial(
+            block_number_to_timestamp, web3=web3
+        )
+        missing_timestamps = parallelize_timestamp_conversion(
+            tools.loc[missing_time_indices], partial_block_number_to_timestamp
+        )
+
         # Update the original DataFrame with the missing timestamps
         for i, timestamp in zip(missing_time_indices, missing_timestamps):
-            tools.at[i,
+            tools.at[i, "request_time"] = timestamp

-    tools[
+    tools["request_month_year"] = pd.to_datetime(tools["request_time"]).dt.strftime(
+        "%Y-%m"
+    )
+    tools["request_month_year_week"] = (
+        pd.to_datetime(tools["request_time"]).dt.to_period("W").astype(str)
+    )

-    # Save the tools
+    # Save the tools data after the updates on the content
     tools.to_parquet(DATA_DIR / TOOLS_FILENAME, index=False)

     # Update t_map with new timestamps
-    new_timestamps =
+    new_timestamps = (
+        tools[["request_block", "request_time"]]
+        .dropna()
+        .set_index("request_block")
+        .to_dict()["request_time"]
+    )
     t_map.update(new_timestamps)

     with open(DATA_DIR / "t_map.pkl", "wb") as f:

@@ -142,4 +160,3 @@ def weekly_analysis():

 if __name__ == "__main__":
     weekly_analysis()
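t_map above is a pickled block-number -> timestamp cache; after filling missing request_time values, the script folds the newly resolved pairs back into the cache. A minimal sketch of that update pattern (toy data; only the t_map.pkl file name is reused from the diff):

import pickle
import pandas as pd

tools = pd.DataFrame(
    {"request_block": [101, 102], "request_time": ["2024-03-01 10:00:00", None]}
)
t_map = {100: "2024-02-29 09:00:00"}  # previously cached block -> timestamp entries

new_timestamps = (
    tools[["request_block", "request_time"]]
    .dropna()
    .set_index("request_block")
    .to_dict()["request_time"]
)
t_map.update(new_timestamps)  # only block 101 is added; block 102 had no resolved time

with open("t_map.pkl", "wb") as f:
    pickle.dump(t_map, f)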
scripts/tools.py
CHANGED

@@ -20,22 +20,17 @@
 import json
 import os.path
 import re
-import sys
 import time
 import random
 from dataclasses import dataclass
 from enum import Enum
-from io import StringIO
 from typing import (
     Optional,
     List,
     Dict,
     Any,
     Union,
-    Callable,
-    Tuple,
 )
 import pandas as pd
 import requests
 from json.decoder import JSONDecodeError

@@ -56,8 +51,18 @@ from web3 import Web3, HTTPProvider
 from web3.exceptions import MismatchedABI
 from web3.types import BlockParams
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from
+from get_mech_info import get_mech_info_last_60_days
+from utils import (
+    clean,
+    BLOCK_FIELD,
+    gen_event_filename,
+    read_abi,
+    SLEEP,
+    reduce_window,
+    limit_text,
+    DATA_DIR,
+    REQUEST_ID_FIELD,
+)

 CONTRACTS_PATH = "contracts"
 MECH_TO_INFO = {

@@ -71,14 +76,11 @@ LATEST_BLOCK: Optional[int] = None
 LATEST_BLOCK_NAME: BlockParams = "latest"
 BLOCK_DATA_NUMBER = "number"
 BLOCKS_CHUNK_SIZE = 10_000
-REDUCE_FACTOR = 0.25
 EVENT_ARGUMENTS = "args"
 DATA = "data"
 REQUEST_ID = "requestId"
-REQUEST_ID_FIELD = "request_id"
 REQUEST_SENDER = "sender"
 PROMPT_FIELD = "prompt"
-BLOCK_FIELD = "block"
 CID_PREFIX = "f01701220"
 HTTP = "http://"
 HTTPS = HTTP[:4] + "s" + HTTP[4:]

@@ -89,7 +91,6 @@ STATUS_FORCELIST = [404, 500, 502, 503, 504]
 DEFAULT_FILENAME = "tools.parquet"
 RE_RPC_FILTER_ERROR = r"Filter with id: '\d+' does not exist."
 ABI_ERROR = "The event signature did not match the provided ABI"
-SLEEP = 0.5
 HTTP_TIMEOUT = 10
 N_IPFS_RETRIES = 1
 N_RPC_RETRIES = 100

@@ -109,13 +110,12 @@ IRRELEVANT_TOOLS = [
     "deepmind-optimization",
 ]
 # this is how frequently we will keep a snapshot of the progress so far in terms of blocks' batches
-# for example, the value 1 means that for every `BLOCKS_CHUNK_SIZE` blocks that we search,
+# for example, the value 1 means that for every `BLOCKS_CHUNK_SIZE` blocks that we search,
+# we also store the snapshot
 SNAPSHOT_RATE = 10
 NUM_WORKERS = 10
 GET_CONTENTS_BATCH_SIZE = 1000
-
-ROOT_DIR = SCRIPTS_DIR.parent
-DATA_DIR = ROOT_DIR / "data"
+

 class MechEventName(Enum):
     """The mech's event names."""

@@ -289,31 +289,6 @@ EVENT_TO_MECH_STRUCT = {
 }


-def parse_args() -> str:
-    """Parse the arguments and return the RPC."""
-    if len(sys.argv) != 2:
-        raise ValueError("Expected the RPC as a positional argument.")
-    return sys.argv[1]
-
-
-def read_abi(abi_path: str) -> str:
-    """Read and return the wxDAI contract's ABI."""
-    with open(abi_path) as abi_file:
-        return abi_file.read()
-
-
-def reduce_window(contract_instance, event, from_block, batch_size, latest_block):
-    """Dynamically reduce the batch size window."""
-    keep_fraction = 1 - REDUCE_FACTOR
-    events_filter = contract_instance.events[event].build_filter()
-    events_filter.fromBlock = from_block
-    batch_size = int(batch_size * keep_fraction)
-    events_filter.toBlock = min(from_block + batch_size, latest_block)
-    tqdm.write(f"RPC timed out! Resizing batch size to {batch_size}.")
-    time.sleep(SLEEP)
-    return events_filter, batch_size
-
-
 def get_events(
     w3: Web3,
     event: str,

@@ -442,13 +417,6 @@ def request(
         return None


-def limit_text(text: str, limit: int = 200) -> str:
-    """Limit the given text"""
-    if len(text) > limit:
-        return f"{text[:limit]}..."
-    return text
-
-
 def parse_ipfs_response(
     session: requests.Session,
     url: str,

@@ -523,38 +491,12 @@ def get_contents(
     return pd.DataFrame(contents)


-def check_for_dicts(df: pd.DataFrame) -> List[str]:
-    """Check for columns that contain dictionaries."""
-    dict_columns = []
-    for column in df.columns:
-        if df[column].apply(lambda x: isinstance(x, dict)).any():
-            dict_columns.append(column)
-    return dict_columns
-
-
-def drop_dict_rows(df: pd.DataFrame,
-                   dict_columns: List[str]) -> pd.DataFrame:
-    """Drop rows that contain dictionaries."""
-    for column in dict_columns:
-        df = df[~df[column].apply(lambda x: isinstance(x, dict))]
-    return df
-
-
-def clean(df: pd.DataFrame) -> pd.DataFrame:
-    """Clean the dataframe."""
-    dict_columns = check_for_dicts(df)
-    df = drop_dict_rows(df, dict_columns)
-    cleaned = df.drop_duplicates()
-    cleaned[REQUEST_ID_FIELD] = cleaned[REQUEST_ID_FIELD].astype("str")
-    return cleaned
-
-
 def transform_request(contents: pd.DataFrame) -> pd.DataFrame:
     """Transform the requests dataframe."""
     return clean(contents)


-def transform_deliver(contents: pd.DataFrame
+def transform_deliver(contents: pd.DataFrame) -> pd.DataFrame:
     """Transform the delivers dataframe."""
     unpacked_result = pd.json_normalize(contents.result)
     # # drop result column if it exists

@@ -578,55 +520,27 @@ def transform_deliver(contents: pd.DataFrame, full_contents=False) -> pd.DataFra
     return clean(contents)


-def gen_event_filename(event_name: MechEventName) -> str:
-    """Generate the filename of an event."""
-    return f"{event_name.value.lower()}s.parquet"
-
-
-def read_n_last_lines(filename: str, n: int = 1) -> str:
-    """Return the `n` last lines' content of a file."""
-    num_newlines = 0
-    with open(filename, "rb") as f:
-        try:
-            f.seek(-2, os.SEEK_END)
-            while num_newlines < n:
-                f.seek(-2, os.SEEK_CUR)
-                if f.read(1) == b"\n":
-                    num_newlines += 1
-        except OSError:
-            f.seek(0)
-        last_line = f.readline().decode()
-    return last_line
-
-
-def get_earliest_block(event_name: MechEventName) -> int:
-    """Get the earliest block number to use when filtering for events."""
-    filename = gen_event_filename(event_name)
-    if not os.path.exists(DATA_DIR / filename):
-        return 0
-
-    df = pd.read_parquet(DATA_DIR / filename)
-    block_field = f"{event_name.value.lower()}_{BLOCK_FIELD}"
-    return int(df[block_field].max())
-
-
 def store_progress(
     filename: str,
     event_to_contents: Dict[str, pd.DataFrame],
     tools: pd.DataFrame,
 ) -> None:
     """Store the given progress."""
-    print("
+    print("storing given progress")
     if filename:
         DATA_DIR.mkdir(parents=True, exist_ok=True)  # Ensure the directory exists
         for event_name, content in event_to_contents.items():
-            event_filename = gen_event_filename(
+            event_filename = gen_event_filename(
+                event_name
+            )  # Ensure this function returns a valid filename string
            try:
                if "result" in content.columns:
-                    content = content.drop(
+                    content = content.drop(
+                        columns=["result"]
+                    )  # Avoid in-place modification
                 content.to_parquet(DATA_DIR / event_filename, index=False)
             except Exception as e:
-                print(f"Failed to write {event_name}: {e}")
+                print(f"Failed to write {event_name} data: {e}")
         # Drop result and error columns for tools DataFrame
         try:
             if "result" in tools.columns:

@@ -637,7 +551,8 @@ def store_progress(


 def etl(
-    rpcs: List[str],
+    rpcs: List[str],
+    filename: Optional[str] = None,
 ) -> pd.DataFrame:
     """Fetch from on-chain events, process, store and return the tools' results on all the questions as a Dataframe."""
     w3s = [Web3(HTTPProvider(r)) for r in rpcs]

@@ -646,13 +561,15 @@ def etl(
         MechEventName.REQUEST: transform_request,
         MechEventName.DELIVER: transform_deliver,
     }
+
     mech_to_info = {
         to_checksum_address(address): (
             os.path.join(CONTRACTS_PATH, filename),
             earliest_block,
         )
-        for address, (filename, earliest_block) in
+        for address, (filename, earliest_block) in get_mech_info_last_60_days().items()
     }
+
     event_to_contents = {}

     latest_block = LATEST_BLOCK

@@ -663,17 +580,13 @@ def etl(

     # Loop through events in event_to_transformer
     for event_name, transformer in event_to_transformer.items():
-        if next_start_block is None:
+        # if next_start_block is None:
+        #     next_start_block_base = get_earliest_block(event_name)

         # Loop through mech addresses in mech_to_info
         events = []
         for address, (abi, earliest_block) in mech_to_info.items():
-            next_start_block = earliest_block
-            else:
-                next_start_block = next_start_block_base
+            next_start_block = earliest_block
             print(
                 f"Searching for {event_name.value} events for mech {address} from block {next_start_block} to {latest_block}."
             )

@@ -704,6 +617,7 @@ def etl(
             current_mech_events = future.result()
             events.extend(current_mech_events)

+        print("Parsing events")
         parsed = parse_events(events)

         contents = []

@@ -729,31 +643,28 @@ def etl(

         contents = pd.concat(contents, ignore_index=True)

-        if event_name == MechEventName.REQUEST:
-            transformed = transformer(contents)
-        elif event_name == MechEventName.DELIVER:
-            transformed = transformer(contents, full_contents=full_contents)
-
-        events_filename = gen_event_filename(event_name)
+        transformed = transformer(contents)

+        # Remove appending data, always new files
+        # if os.path.exists(DATA_DIR / events_filename):
+        #     old = pd.read_parquet(DATA_DIR / events_filename)

+        #     # Reset index to avoid index conflicts
+        #     old.reset_index(drop=True, inplace=True)
+        #     transformed.reset_index(drop=True, inplace=True)

+        #     # Concatenate DataFrames
+        #     transformed = pd.concat([old, transformed], ignore_index=True)

+        #     # Drop duplicates if necessary
+        #     transformed.drop_duplicates(subset=REQUEST_ID_FIELD, inplace=True)

         event_to_contents[event_name] = transformed.copy()

     # Store progress
     tools = pd.merge(*event_to_contents.values(), on=REQUEST_ID_FIELD)
+    print(tools.info())
+
     store_progress(filename, event_to_contents, tools)

     return tools

@@ -763,5 +674,6 @@ if __name__ == "__main__":
     RPCs = [
         "https://lb.nodies.app/v1/406d8dcc043f4cb3959ed7d6673d311a",
     ]
+    filename = DEFAULT_FILENAME

-    tools = etl(rpcs=RPCs, filename=
+    tools = etl(rpcs=RPCs, filename=filename)
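etl() now builds its address -> (ABI path, starting block) map from get_mech_info_last_60_days() instead of a static constant. A small sketch of that reshaping step (addresses and block number are placeholders; the to_checksum_address() call used in the real code is omitted to keep the snippet dependency-free):

import os.path

CONTRACTS_PATH = "contracts"

# shape returned by get_mech_info_last_60_days(): address -> (abi filename, earliest block)
mech_info = {
    "0xff82...ae81": ("old_mech_abi.json", 33_000_000),  # placeholder values
    "0x77af...786a": ("new_mech_abi.json", 33_000_000),
}

mech_to_info = {
    address: (os.path.join(CONTRACTS_PATH, filename), earliest_block)
    for address, (filename, earliest_block) in mech_info.items()
}
print(mech_to_info)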
scripts/utils.py
ADDED

@@ -0,0 +1,110 @@
+import sys
+import os
+import time
+from tqdm import tqdm
+from tools import MechEventName
+from typing import List
+import pandas as pd
+import gc
+from pathlib import Path
+
+REDUCE_FACTOR = 0.25
+SLEEP = 0.5
+REQUEST_ID_FIELD = "request_id"
+SCRIPTS_DIR = Path(__file__).parent
+ROOT_DIR = SCRIPTS_DIR.parent
+DATA_DIR = ROOT_DIR / "data"
+BLOCK_FIELD = "block"
+
+
+def parse_args() -> str:
+    """Parse the arguments and return the RPC."""
+    if len(sys.argv) != 2:
+        raise ValueError("Expected the RPC as a positional argument.")
+    return sys.argv[1]
+
+
+def read_abi(abi_path: str) -> str:
+    """Read and return the wxDAI contract's ABI."""
+    with open(abi_path) as abi_file:
+        return abi_file.read()
+
+
+def reduce_window(contract_instance, event, from_block, batch_size, latest_block):
+    """Dynamically reduce the batch size window."""
+    keep_fraction = 1 - REDUCE_FACTOR
+    events_filter = contract_instance.events[event].build_filter()
+    events_filter.fromBlock = from_block
+    batch_size = int(batch_size * keep_fraction)
+    events_filter.toBlock = min(from_block + batch_size, latest_block)
+    tqdm.write(f"RPC timed out! Resizing batch size to {batch_size}.")
+    time.sleep(SLEEP)
+    return events_filter, batch_size
+
+
+def limit_text(text: str, limit: int = 200) -> str:
+    """Limit the given text"""
+    if len(text) > limit:
+        return f"{text[:limit]}..."
+    return text
+
+
+def check_for_dicts(df: pd.DataFrame) -> List[str]:
+    """Check for columns that contain dictionaries."""
+    dict_columns = []
+    for column in df.columns:
+        if df[column].apply(lambda x: isinstance(x, dict)).any():
+            dict_columns.append(column)
+    return dict_columns
+
+
+def drop_dict_rows(df: pd.DataFrame, dict_columns: List[str]) -> pd.DataFrame:
+    """Drop rows that contain dictionaries."""
+    for column in dict_columns:
+        df = df[~df[column].apply(lambda x: isinstance(x, dict))]
+    return df
+
+
+def clean(df: pd.DataFrame) -> pd.DataFrame:
+    """Clean the dataframe."""
+    dict_columns = check_for_dicts(df)
+    df = drop_dict_rows(df, dict_columns)
+    cleaned = df.drop_duplicates()
+    cleaned[REQUEST_ID_FIELD] = cleaned[REQUEST_ID_FIELD].astype("str")
+    return cleaned
+
+
+def gen_event_filename(event_name: MechEventName) -> str:
+    """Generate the filename of an event."""
+    return f"{event_name.value.lower()}s.parquet"
+
+
+def read_n_last_lines(filename: str, n: int = 1) -> str:
+    """Return the `n` last lines' content of a file."""
+    num_newlines = 0
+    with open(filename, "rb") as f:
+        try:
+            f.seek(-2, os.SEEK_END)
+            while num_newlines < n:
+                f.seek(-2, os.SEEK_CUR)
+                if f.read(1) == b"\n":
+                    num_newlines += 1
+        except OSError:
+            f.seek(0)
+        last_line = f.readline().decode()
+    return last_line
+
+
+def get_earliest_block(event_name: MechEventName) -> int:
+    """Get the earliest block number to use when filtering for events."""
+    filename = gen_event_filename(event_name)
+    if not os.path.exists(DATA_DIR / filename):
+        return 0
+
+    df = pd.read_parquet(DATA_DIR / filename)
+    block_field = f"{event_name.value.lower()}_{BLOCK_FIELD}"
+    earliest_block = int(df[block_field].max())
+    # clean and release all memory
+    del df
+    gc.collect()
+    return earliest_block
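The clean() helper above drops rows whose cells hold raw dict payloads, de-duplicates, and casts the request id to string. A tiny usage sketch (toy DataFrame; assumes it is run from the scripts/ directory so the utils module is importable):

import pandas as pd
from utils import clean  # assumes scripts/ is on the import path

df = pd.DataFrame(
    {
        "request_id": [1, 1, 2],
        "result": [{"p_yes": 0.6}, {"p_yes": 0.6}, "ok"],  # first two rows hold dicts
    }
)
cleaned = clean(df)  # dict-holding rows are dropped, duplicates removed, request_id cast to str
print(cleaned)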