Eval_Cards

Sleeping

App Files Community

evijit HF Staff committed on Dec 7, 2024

Commit

a226d69

verified ·

1 Parent(s): fcd3e1e

Update model_data/model_a_data.json

Browse files

Files changed (1) hide show

model_data/model_a_data.json +304 -174

model_data/model_a_data.json CHANGED Viewed

@@ -1,219 +1,349 @@
 {
   "metadata": {
-    "Name": "Model A",
-    "Provider": "TechCorp",
-    "Version": "2.1",
-    "Release Date": "2023-09-15",
     "Type": "Large Language Model",
     "Modalities": ["Text-to-Text"]
   },
   "scores": {
     "Bias, Stereotypes, and Representational Harms Evaluation": {
-      "Comprehensive Evaluation Methodology": {
         "status": "Yes",
-        "source": "Both",
-        "applicable_evaluations": [
-          "Evaluations at various stages (data collection, preprocessing, model architecture, training, deployment)",
-          "Both intrinsic (e.g., embedding analysis) and extrinsic (e.g., downstream task performance) evaluation methods"
-        ]
       },
-      "Inclusive Protected Class Consideration": {
         "status": "No",
-        "source": null,
-        "applicable_evaluations": [
-          "Evaluation of non-standard protected classes (e.g., socioeconomic status, education level, regional differences)",
-          "Consideration of intersectionality and how identity aspects interact",
-          "Assessment of potential harms to non-typical groups (e.g., by profession or hobbies)"
-        ]
-      },
-      "Cultural and Linguistic Diversity": {
-        "status": "Yes",
-        "source": "3P",
-        "applicable_evaluations": [
-          "Tests of model performance and biases across languages and cultures",
-          "Consideration of how protected categories may shift in meaning across regions"
-        ]
       },
-      "Stereotype and Harmful Association Detection": {
         "status": "Yes",
-        "source": "1P",
-        "applicable_evaluations": [
-          "Detection of stereotypical word associations in text models",
-          "Sentiment analysis and toxicity measurements, especially regarding specific groups"
-        ]
       },
-      "Performance Disparities Assessment": {
-        "status": "No",
-        "source": null,
-        "applicable_evaluations": [
-          "Detailed breakdowns of performance metrics (accuracy, precision, recall) for various subgroups",
-          "Performance analysis for disadvantaged subgroups",
-          "Intersectionality considerations in performance analysis"
-        ]
       }
     },
     "Cultural Values and Sensitive Content Evaluation": {
-      "Hate Speech and Toxicity Evaluation": {
-        "status": "Yes",
-        "source": "Both",
-        "applicable_evaluations": [
-          "Assessments of harmful text generation",
-          "Evaluations of toxicity, hurtfulness, or offensiveness"
-        ]
       },
-      "Cultural Value Representation": {
-        "status": "No",
-        "source": null,
-        "applicable_evaluations": [
-          "Use of pre-existing scholarship (e.g., World Values Survey, Geert Hofstede's work)",
-          "Inductive and participatory evaluations grounded in specific cultural contexts",
-          "Assessments of ethical scenarios and political value representation"
-        ]
-      },
-      "Diverse Cultural Context": {
         "status": "Yes",
-        "source": "3P",
-        "applicable_evaluations": [
-          "Assessments that don't equate nationality with cultural context",
-          "Representation of differing cultural values within countries"
-        ]
       }
     },
     "Disparate Performance": {
-      "Subpopulation Performance Analysis": {
-        "status": "Yes",
-        "source": "1P",
-        "applicable_evaluations": [
-          "Non-aggregated (disaggregated) evaluation results with in-depth breakdowns across subpopulations",
-          "Metrics such as subgroup accuracy, calibration, AUC, recall, precision, min-max ratios"
-        ]
       },
-      "Cross-lingual and Dialect Evaluation": {
-        "status": "No",
-        "source": null,
-        "applicable_evaluations": [
-          "Cross-lingual prompting on standard benchmarks",
-          "Examination of performance across dialects",
-          "Analysis of hallucination disparity across languages"
-        ]
-      },
-      "Image Generation Quality Assessment": {
         "status": "N/A",
-        "source": null,
-        "applicable_evaluations": []
       }
     },
     "Environmental Costs and Carbon Emissions Evaluation": {
-      "Energy Consumption Measurement": {
         "status": "Yes",
-        "source": "1P",
-        "applicable_evaluations": [
-          "Measurement of energy used in training, testing, and deploying the system",
-          "Evaluation of compute power consumption"
-        ]
       },
-      "Carbon Footprint Quantification": {
-        "status": "No",
-        "source": null,
-        "applicable_evaluations": [
-          "Use of tools like CodeCarbon or Carbontracker",
-          "Measurement of carbon emissions for training and inference",
-          "Conversion of energy consumption to carbon emissions"
-        ]
-      },
-      "Hardware Resource Evaluation": {
         "status": "Yes",
-        "source": "1P",
-        "applicable_evaluations": [
-          "Assessment of CPU, GPU, and TPU usage",
-          "Measurement of FLOPS (Floating Point Operations)"
-        ]
       }
     },
     "Privacy and Data Protection Evaluation": {
-      "Data Minimization and Consent Practices": {
         "status": "Yes",
-        "source": "Both",
-        "applicable_evaluations": [
-          "Implementation of data minimization practices",
-          "Use of opt-in data collection methods",
-          "Assessment of active consent for collecting, processing, and sharing data"
-        ]
-      },
-      "Memorization and Data Leakage Evaluation": {
         "status": "Yes",
-        "source": "1P",
-        "applicable_evaluations": [
-          "Examination of the maximum amount of discoverable information given training data",
-          "Evaluation of extractable information without training data access"
-        ]
       },
-      "Personal Information Revelation Assessment": {
-        "status": "No",
-        "source": null,
-        "applicable_evaluations": [
-          "Direct prompting tests to reveal Personally Identifiable Information (PII)",
-          "Use of tools like ProPILE to audit PII revelation likelihood",
-          "Evaluation of the system's ability to infer personal attributes"
-        ]
       }
     },
     "Financial Costs Evaluation": {
-      "Comprehensive Cost Evaluation": {
-        "status": "Yes",
-        "source": "1P",
-        "applicable_evaluations": [
-          "Estimation of infrastructure and hardware costs",
-          "Calculation of labor hours from researchers, developers, and crowd workers",
-          "Tracking of compute costs using low-cost or standard pricing per instance-hour"
-        ]
-      },
-      "Storage and Training Cost Analysis": {
-        "status": "Yes",
-        "source": "1P",
-        "applicable_evaluations": [
-          "Assessment of storage costs for both datasets and resulting models",
-          "Consideration of in-house vs. cloud storage options",
-          "Evaluation of training costs based on in-house GPUs or per-hour-priced instances"
-        ]
-      },
-      "Hosting and Inference Cost Evaluation": {
-        "status": "No",
-        "source": null,
-        "applicable_evaluations": [
-          "Evaluation of low-latency serving costs",
-          "Assessment of inference costs based on token usage",
-          "Consideration of factors such as initial prompt length and requested token response length"
-        ]
       }
     },
     "Data and Content Moderation Labor Evaluation": {
-      "Crowdwork Standards Compliance": {
-        "status": "No",
-        "source": null,
-        "applicable_evaluations": [
-          "Assessment of compliance with Criteria for Fairer Microwork",
-          "Evaluation against Partnership on AI's Responsible Sourcing of Data Enrichment Services guidelines",
-          "Comparison with Oxford Internet Institute's Fairwork Principles"
-        ]
-      },
-      "Crowdworker Demographics and Compensation": {
         "status": "Yes",
-        "source": "3P",
-        "applicable_evaluations": [
-          "Documentation of crowd workers' demographics",
-          "Transparency in reporting instructions given to crowdworkers",
-          "Assessment of how crowdworkers were evaluated and compensated"
-        ]
-      },
-      "Psychological Support and Content Exposure": {
-        "status": "No",
-        "source": null,
-        "applicable_evaluations": [
-          "Documentation of immediate trauma support availability",
-          "Assessment of long-term professional psychological support provision",
-          "Evaluation of practices for controlling exposure to traumatic material"
-        ]
       }
     }
   }

 {
   "metadata": {
+    "Name": "StarCoder2",
+    "Provider": "BigCode",
+    "URL": "https://huggingface.co/bigcode/starcoder2-15b",
     "Type": "Large Language Model",
     "Modalities": ["Text-to-Text"]
   },
   "scores": {
     "Bias, Stereotypes, and Representational Harms Evaluation": {
+      "1.1 Bias Detection Overview": {
         "status": "Yes",
+        "sources": [
+          {
+            "type": "🌐",
+            "detail": "https://arxiv.org/abs/2402.19173",
+            "name": "BOLD - Bias in Open-ended Language Generation Dataset"
+          },
+          {
+            "type": "🌐",
+            "detail": "https://arxiv.org/abs/2402.19173",
+            "name": "WinoBias"
+          }
+        ],
+        "questions": {
+          "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": false,
+          "Have intrinsic properties of the AI system been evaluated for bias (e.g., embedding analysis)": false,
+          "Have extrinsic bias evaluations been run (e.g., downstream task performance)": true,
+          "Have evaluations been run across all applicable modalities": true,
+          "Have bias evaluations been run that take the form of automatic quantitative evaluation": true,
+          "Have bias evaluations been run with human participants?": false
+        }
       },
+      "1.2 Protected Classes and Intersectional Measures": {
         "status": "No",
+        "sources": [],
+        "questions": {
+          "Do evaluations cover all applicable legal protected categories for in-scope uses of the system?": false,
+          "Do evaluations cover additional subgroups that are likely to be harmed based on other personal characteristics": false,
+          "Evaluation of how different aspects of identity interact and compound in AI system behavior": false,
+          "Evaluation of AI system biases for legal protected categories and additional relevant subgroups": false
+        }
       },
+      "1.3 Measurement of Stereotypes and Harmful Associations": {
         "status": "Yes",
+        "sources": [
+          {
+            "type": "🌐",
+            "detail": "https://arxiv.org/abs/2402.19173",
+            "name": "HONEST - Hurtful Sentence Completion in English Language Models"
+          },
+          {
+            "type": "🌐",
+            "detail": "https://arxiv.org/abs/2402.19173",
+            "name": "RealToxicityPrompts"
+          }
+        ],
+        "questions": {
+          "Measurement of known stereotypes in AI system outputs": true,
+          "Measurement of other negative associations and assumptions regarding specific groups": true,
+          "Measurement of stereotypes and negative associations across in-scope contexts": false
+        }
       },
+      "1.4 Bias Evaluation Transparency and Documentation": {
+        "status": "Yes",
+        "sources": [
+          {
+            "type": "🌐",
+            "detail": "https://arxiv.org/abs/2402.19173",
+            "name": "Evaluation Documentation"
+          }
+        ],
+        "questions": {
+          "Sufficient documentation of evaluation methods (including code and datasets) to replicate findings": true,
+          "Sufficient documentation of evaluation results (including intermediary statistics) to support comparison to other AI systems": true,
+          "Documentation of bias mitigation measures, including their secondary impacts": false,
+          "Documentation of bias monitoring approaches post-release/deployment if applicable": false
+        }
       }
     },
     "Cultural Values and Sensitive Content Evaluation": {
+      "2.1 Cultural Variation Overview": {
+        "status": "N/A",
+        "sources": [],
+        "questions": {}
       },
+      "2.2 Cultural Diversity and Representation": {
+        "status": "N/A",
+        "sources": [],
+        "questions": {}
+      },
+      "2.3 Generated Sensitive Content across Cultural Contexts": {
         "status": "Yes",
+        "sources": [
+          {
+            "type": "🌐",
+            "detail": "https://arxiv.org/abs/2402.19173",
+            "name": "HONEST - Hurtful Sentence Completion in English Language Models"
+          },
+          {
+            "type": "🌐",
+            "detail": "https://arxiv.org/abs/2402.19173",
+            "name": "RealToxicityPrompts"
+          }
+        ],
+        "questions": {
+          "Has the AI system been evaluated for its likelihood of facilitating generation of threatening or violent content": true,
+          "Has the AI system been evaluated for its likelihood of facilitating generation of targeted harassment or discrimination": false,
+          "Has the AI system been evaluated for its likelihood of facilitating generation of hate speech": false,
+          "Has the AI system been evaluated for its likelihood of exposing its direct users to content embedding values and assumptions": false,
+          "Has the AI system been evaluated for its likelihood of exposing its direct users to categories of content that might be inappropriate": true,
+          "Has the AI system been evaluated for its likelihood of exposing its direct users to content that might have additional negative psychological impacts": false,
+          "Has the evaluation of the AI system's behaviors explicitly considered cultural variation": false
+        }
+      },
+      "2.4 Cultural Variation Transparency and Documentation": {
+        "status": "N/A",
+        "sources": [],
+        "questions": {}
       }
     },
     "Disparate Performance": {
+      "3.1 Disparate Performance Overview": {
+        "status": "N/A",
+        "sources": [],
+        "questions": {}
       },
+      "3.2 Identifying Target Groups for Disparate Performance Evaluation": {
+        "status": "N/A",
+        "sources": [],
+        "questions": {}
+      },
+      "3.3 Subgroup Performance Analysis": {
         "status": "N/A",
+        "sources": [],
+        "questions": {}
+      },
+      "3.4 Disparate Performance Evaluation Transparency and Documentation": {
+        "status": "N/A",
+        "sources": [],
+        "questions": {}
       }
     },
     "Environmental Costs and Carbon Emissions Evaluation": {
+      "4.1 Environmental Costs Overview": {
         "status": "Yes",
+        "sources": [
+          {
+            "type": "🌐",
+            "detail": "https://mlco2.github.io/impact/#compute",
+            "name": "Machine Learning Emissions Calculator"
+          }
+        ],
+        "questions": {
+          "Evaluations of different processes within development and deployment": false,
+          "Have evaluations been run across all applicable modalities?": true,
+          "Have evaluations been run on standardized benchmarks or metrics?": true,
+          "Have evaluations taken into account community feedback from regions affected by data center power consumption?": false,
+          "Do evaluations consider the full supply chain including environmental impact of hardware components and data centers used?": false
+        }
       },
+      "4.2 Energy Cost and Environmental Impact of Development": {
         "status": "Yes",
+        "sources": [
+          {
+            "type": "🌐",
+            "detail": "https://mlco2.github.io/impact/#compute",
+            "name": "Machine Learning Emissions Calculator"
+          }
+        ],
+        "questions": {
+          "Accounting of FLOPS across development stages": true,
+          "Evaluation of energy consumption using standardized tracking tools": true,
+          "Evaluation of carbon impact accounting for regional energy sources": true,
+          "Evaluation of hardware lifecycle environmental impact": false
+        }
+      },
+      "4.3 Energy Cost and Environmental Impact of Deployment": {
+        "status": "N/A",
+        "sources": [],
+        "questions": {}
+      },
+      "4.4 Environmental Costs Transparency and Documentation": {
+        "status": "Yes",
+        "sources": [
+          {
+            "type": "🌐",
+            "detail": "https://mlco2.github.io/impact/#compute",
+            "name": "Machine Learning Emissions Calculator"
+          }
+        ],
+        "questions": {
+          "Documentation about equipment and infrastructure specifications": true,
+          "Sufficient documentation of evaluation methods including components covered": false,
+          "Sufficient documentation of evaluation methods to replicate findings": true,
+          "Sufficient documentation of evaluation results for comparison": true
+        }
       }
     },
     "Privacy and Data Protection Evaluation": {
+      "5.1 Privacy and Data Protection Overview": {
         "status": "Yes",
+        "sources": [
+          {
+            "type": "🏢",
+            "detail": "PII detection and redaction using an NER model"
+          },
+          {
+            "type": "🌐",
+            "detail": "https://huggingface.co/spaces/bigcode/in-the-stack",
+            "name": "Opt-out tool for users"
+          },
+          {
+            "type": "🌐",
+            "detail": "https://arxiv.org/abs/2402.19173",
+            "name": "Asleep at the Keyboard Security Benchmark"
+          }
+        ],
+        "questions": {
+          "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": true,
+          "Have intrinsic properties of the AI system been evaluated for privacy vulnerabilities": false,
+          "Have extrinsic privacy evaluations been run": true,
+          "Have evaluations been run across all applicable modalities": true,
+          "Have privacy evaluations been run that take the form of automatic quantitative evaluation": true,
+          "Have privacy evaluations been run with human participants?": false
+        }
+      },
+      "5.2 Privacy, Likeness, and Publicity Harms": {
+        "status": "N/A",
+        "sources": [],
+        "questions": {}
+      },
+      "5.3 Intellectual Property and Information Security": {
         "status": "Yes",
+        "sources": [
+          {
+            "type": "🏢",
+            "detail": "Membership test to find if generated code was copied from the training corpus"
+          },
+          {
+            "type": "🏢",
+            "detail": "Code attribution tool to find the original author and license of the generated code"
+          },
+          {
+            "type": "🌐",
+            "detail": "https://arxiv.org/abs/2402.19173",
+            "name": "Asleep at the Keyboard Security Benchmark"
+          }
+        ],
+        "questions": {
+          "Has the AI system been evaluated for its likelihood of reproducing other categories of information from its training data": true,
+          "Has the system been evaluated for other information security risks for in-scope uses": false
+        }
       },
+      "5.4 Privacy Evaluation Transparency and Documentation": {
+        "status": "Yes",
+        "sources": [
+          {
+            "type": "🏢",
+            "detail": "Documentation of training data information risk categories and consent status"
+          }
+        ],
+        "questions": {
+          "Documentation of the categories of training data that present information risk": true,
+          "Documentation of evaluation methods to replicate findings": true,
+          "Documentation of evaluation results to support comparison": true,
+          "Documentation of evaluation limitations": false,
+          "Documentation of deployment considerations": false
+        }
       }
     },
     "Financial Costs Evaluation": {
+      "6.1 Financial Costs Overview": {
+        "status": "N/A",
+        "sources": [],
+        "questions": {}
+      },
+      "6.2 Development and Training Costs": {
+        "status": "N/A",
+        "sources": [],
+        "questions": {}
+      },
+      "6.3 Deployment and Operation Costs": {
+        "status": "N/A",
+        "sources": [],
+        "questions": {}
+      },
+      "6.4 Financial Cost Documentation and Transparency": {
+        "status": "N/A",
+        "sources": [],
+        "questions": {}
       }
     },
     "Data and Content Moderation Labor Evaluation": {
+      "7.1 Labor Evaluation Overview": {
         "status": "Yes",
+        "sources": [
+          {
+            "type": "🏢",
+            "detail": "PII annotations by human annotators with fair wage"
+          }
+        ],
+        "questions": {
+          "Evaluation of labor practices at various stages": true,
+          "Have labor conditions been evaluated for different worker categories": true,
+          "Have labor evaluations been run across all applicable task types": false,
+          "Have labor practices been evaluated against established industry standards": true,
+          "Have labor evaluations included both direct employees and contracted workers": false,
+          "Have evaluations considered different regional and jurisdictional contexts": true
+        }
+      },
+      "7.2 Working Conditions and Compensation": {
+        "status": "Yes",
+        "sources": [
+          {
+            "type": "🏢",
+            "detail": "PII annotations by human annotators with fair wage"
+          }
+        ],
+        "questions": {
+          "Assessment of compensation relative to local living wages and industry standards": true,
+          "Assessment of job security and employment classification": false,
+          "Evaluation of workplace safety, worker protections and rights": false,
+          "Assessment of worker autonomy and task assignment practices": false,
+          "Evaluation of power dynamics and worker feedback mechanisms": false
+        }
+      },
+      "7.3 Worker Wellbeing and Support": {
+        "status": "N/A",
+        "sources": [],
+        "questions": {}
+      },
+      "7.4 Labor Practice Documentation and Transparency": {
+        "status": "Yes",
+        "sources": [
+          {
+            "type": "🏢",
+            "detail": "PII annotations by human annotators with fair wage"
+          }
+        ],
+        "questions": {
+          "Documentation of labor evaluation methodology and frameworks used": true,
+          "Documentation of worker demographics and task distribution": false,
+          "Documentation of support systems, worker protections": false,
+          "Documentation of incident reporting and resolution procedures": false
+        }
       }
     }
   }