Spaces:

davanstrien
/

huggingface-datasets-search-v2

Running on CPU Upgrade

App Files Files Community

davanstrien HF Staff committed on Aug 14, 2024

Commit

e35e532

1 Parent(s): a8bda03

chore: Filter out unmodified template cards in load_cards()

Browse files

Files changed (1) hide show

load_data.py +45 -9

load_data.py CHANGED Viewed

@@ -12,7 +12,10 @@ from chromadb.utils import embedding_functions
 from dotenv import load_dotenv
 from huggingface_hub import InferenceClient
 from tqdm.contrib.concurrent import thread_map
 # Set up logging
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
@@ -67,15 +70,20 @@ def get_collection(chroma_client, embedding_function):
 def get_last_modified_in_collection(collection) -> datetime | None:
     logger.info("Fetching last modified date from collection")
-    all_items = collection.get(include=["metadatas"])
-    if last_modified := [
-        datetime.fromisoformat(item["last_modified"]) for item in all_items["metadatas"]
-    ]:
-        last_mod = max(last_modified)
-        logger.info(f"Last modified date: {last_mod}")
-        return last_mod
-    else:
-        logger.info("No last modified date found")
         return None
@@ -106,6 +114,26 @@ def parse_markdown_column(
     )
 def load_cards(
     min_len: int = 50,
     min_likes: int | None = None,
@@ -122,6 +150,14 @@ def load_cards(
         df = df.filter(pl.col("likes") > min_likes)
     if last_modified:
         df = df.filter(pl.col("last_modified") > last_modified)
     if len(df) == 0:
         logger.info("No cards found matching criteria")
         return None

 from dotenv import load_dotenv
 from huggingface_hub import InferenceClient
 from tqdm.contrib.concurrent import thread_map
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 # Set up logging
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
 def get_last_modified_in_collection(collection) -> datetime | None:
     logger.info("Fetching last modified date from collection")
+    try:
+        all_items = collection.get(include=["metadatas"])
+        if last_modified := [
+            datetime.fromisoformat(item["last_modified"])
+            for item in all_items["metadatas"]
+        ]:
+            last_mod = max(last_modified)
+            logger.info(f"Last modified date: {last_mod}")
+            return last_mod
+        else:
+            logger.info("No last modified date found")
+            return None
+    except Exception as e:
+        logger.error(f"Error fetching last modified date: {str(e)}")
         return None
     )
def is_unmodified_template(
    card: str, *, min_indicators: int = 3, min_placeholders: int = 7
) -> bool:
    """Return True when *card* looks like an untouched dataset-card template.

    A card is flagged when EITHER of these holds:

    * at least ``min_indicators`` of the template-specific phrases below occur
      in the text, or
    * the ``[More Information Needed]`` placeholder occurs at least
      ``min_placeholders`` times.

    Args:
        card: Markdown text of the dataset card. Empty or ``None`` input is
            never considered a template.
        min_indicators: Number of distinct template phrases that must be
            present for the first rule to fire.
        min_placeholders: Number of placeholder occurrences that must be
            present for the second rule to fire.

    Returns:
        ``True`` if the card matches either rule, ``False`` otherwise.
    """
    if not card:
        return False

    placeholder = "[More Information Needed]"
    # Phrases specific to the template. The placeholder counts as one
    # indicator here and is additionally counted by occurrence below.
    template_indicators = (
        "# Dataset Card for Dataset Name",
        "<!-- Provide a quick summary of the dataset. -->",
        "This dataset card aims to be a base template for new datasets",
        placeholder,
    )

    indicator_count = sum(phrase in card for phrase in template_indicators)
    placeholder_count = card.count(placeholder)

    # Either signal alone is sufficient (logical OR, not AND).
    return indicator_count >= min_indicators or placeholder_count >= min_placeholders
 def load_cards(
     min_len: int = 50,
     min_likes: int | None = None,
         df = df.filter(pl.col("likes") > min_likes)
     if last_modified:
         df = df.filter(pl.col("last_modified") > last_modified)
+    # Filter out unmodified template cards
+    df = df.filter(
+        ~pl.col("prepended_markdown").map_elements(
+            is_unmodified_template, return_dtype=pl.Boolean
+        )
+    )
     if len(df) == 0:
         logger.info("No cards found matching criteria")
         return None