Improve the generated README with the original model card and a usage reference from the Transformers.js docs
Browse files- app.py +138 -13
 - requirements.txt +1 -0
 
    	
        app.py
    CHANGED
    
    | 
         @@ -3,12 +3,14 @@ import os 
     | 
|
| 3 | 
         
             
            import subprocess
         
     | 
| 4 | 
         
             
            import sys
         
     | 
| 5 | 
         
             
            import shutil
         
     | 
| 
         | 
|
| 6 | 
         
             
            from pathlib import Path
         
     | 
| 7 | 
         
             
            from typing import List, Optional, Tuple
         
     | 
| 8 | 
         
             
            from dataclasses import dataclass
         
     | 
| 9 | 
         | 
| 10 | 
         
             
            import streamlit as st
         
     | 
| 11 | 
         
            -
            from huggingface_hub import HfApi, whoami
         
     | 
| 
         | 
|
| 12 | 
         | 
| 13 | 
         
             
            logging.basicConfig(level=logging.INFO)
         
     | 
| 14 | 
         
             
            logger = logging.getLogger(__name__)
         
     | 
| 
         @@ -58,6 +60,86 @@ class ModelConverter: 
     | 
|
| 58 | 
         
             
                    self.config = config
         
     | 
| 59 | 
         
             
                    self.api = HfApi(token=config.hf_token)
         
     | 
| 60 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 61 | 
         
             
                def setup_repository(self) -> None:
         
     | 
| 62 | 
         
             
                    """Ensure the bundled transformers.js repository is present."""
         
     | 
| 63 | 
         
             
                    if not self.config.repo_path.exists():
         
     | 
| 
         @@ -112,6 +194,14 @@ class ModelConverter: 
     | 
|
| 112 | 
         
             
                        if output_attentions:
         
     | 
| 113 | 
         
             
                            extra_args.append("--output_attentions")
         
     | 
| 114 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 115 | 
         
             
                        result = self._run_conversion_subprocess(
         
     | 
| 116 | 
         
             
                            input_model_id, extra_args=extra_args or None
         
     | 
| 117 | 
         
             
                        )
         
     | 
| 
         @@ -133,9 +223,8 @@ class ModelConverter: 
     | 
|
| 133 | 
         | 
| 134 | 
         
             
                        readme_path = f"{model_folder_path}/README.md"
         
     | 
| 135 | 
         | 
| 136 | 
         
            -
                         
     | 
| 137 | 
         
            -
                             
     | 
| 138 | 
         
            -
                                file.write(self.generate_readme(input_model_id))
         
     | 
| 139 | 
         | 
| 140 | 
         
             
                        self.api.upload_folder(
         
     | 
| 141 | 
         
             
                            folder_path=str(model_folder_path), repo_id=output_model_id
         
     | 
| 
         @@ -147,18 +236,54 @@ class ModelConverter: 
     | 
|
| 147 | 
         
             
                        shutil.rmtree(model_folder_path, ignore_errors=True)
         
     | 
| 148 | 
         | 
| 149 | 
         
             
                def generate_readme(self, imi: str):
         
     | 
| 150 | 
         
            -
                     
     | 
| 151 | 
         
            -
                         
     | 
| 152 | 
         
            -
                        " 
     | 
| 153 | 
         
            -
             
     | 
| 154 | 
         
            -
                         
     | 
| 155 | 
         
            -
             
     | 
| 156 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 157 | 
         
             
                        f"This is an ONNX version of [{imi}](https://huggingface.co/{imi}). "
         
     | 
| 158 | 
         
             
                        "It was automatically converted and uploaded using "
         
     | 
| 159 | 
         
            -
                        "[this  
     | 
| 160 | 
         
             
                    )
         
     | 
| 161 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 162 | 
         | 
| 163 | 
         
             
            def main():
         
     | 
| 164 | 
         
             
                """Main application entry point."""
         
     | 
| 
         @@ -195,7 +320,7 @@ def main(): 
     | 
|
| 195 | 
         | 
| 196 | 
         
             
                    if config.hf_username == input_model_id.split("/")[0]:
         
     | 
| 197 | 
         
             
                        same_repo = st.checkbox(
         
     | 
| 198 | 
         
            -
                            " 
     | 
| 199 | 
         
             
                        )
         
     | 
| 200 | 
         
             
                    else:
         
     | 
| 201 | 
         
             
                        same_repo = False
         
     | 
| 
         | 
|
| 3 | 
         
             
            import subprocess
         
     | 
| 4 | 
         
             
            import sys
         
     | 
| 5 | 
         
             
            import shutil
         
     | 
| 6 | 
         
            +
            import re
         
     | 
| 7 | 
         
             
            from pathlib import Path
         
     | 
| 8 | 
         
             
            from typing import List, Optional, Tuple
         
     | 
| 9 | 
         
             
            from dataclasses import dataclass
         
     | 
| 10 | 
         | 
| 11 | 
         
             
            import streamlit as st
         
     | 
| 12 | 
         
            +
            from huggingface_hub import HfApi, whoami, model_info, hf_hub_download
         
     | 
| 13 | 
         
            +
            import yaml
         
     | 
| 14 | 
         | 
| 15 | 
         
             
            logging.basicConfig(level=logging.INFO)
         
     | 
| 16 | 
         
             
            logger = logging.getLogger(__name__)
         
     | 
| 
         | 
|
| 60 | 
         
             
                    self.config = config
         
     | 
| 61 | 
         
             
                    self.api = HfApi(token=config.hf_token)
         
     | 
| 62 | 
         | 
| 63 | 
         
            +
                def _fetch_original_readme(self, repo_id: str) -> str:
         
     | 
| 64 | 
         
            +
                    try:
         
     | 
| 65 | 
         
            +
                        path = hf_hub_download(
         
     | 
| 66 | 
         
            +
                            repo_id=repo_id, filename="README.md", token=self.config.hf_token
         
     | 
| 67 | 
         
            +
                        )
         
     | 
| 68 | 
         
            +
                        with open(path, "r", encoding="utf-8", errors="ignore") as f:
         
     | 
| 69 | 
         
            +
                            return f.read()
         
     | 
| 70 | 
         
            +
                    except Exception:
         
     | 
| 71 | 
         
            +
                        return ""
         
     | 
| 72 | 
         
            +
             
     | 
| 73 | 
         
            +
                def _strip_yaml_frontmatter(self, text: str) -> str:
         
     | 
| 74 | 
         
            +
                    if not text:
         
     | 
| 75 | 
         
            +
                        return ""
         
     | 
| 76 | 
         
            +
                    if text.startswith("---"):
         
     | 
| 77 | 
         
            +
                        m = re.match(r"^---[\s\S]*?\n---\s*\n", text)
         
     | 
| 78 | 
         
            +
                        if m:
         
     | 
| 79 | 
         
            +
                            return text[m.end() :]
         
     | 
| 80 | 
         
            +
                    return text
         
     | 
| 81 | 
         
            +
             
     | 
| 82 | 
         
            +
                def _extract_yaml_frontmatter(self, text: str) -> Tuple[dict, str]:
         
     | 
| 83 | 
         
            +
                    """Return (frontmatter_dict, body). If no frontmatter, returns ({}, text)."""
         
     | 
| 84 | 
         
            +
                    if not text or not text.startswith("---"):
         
     | 
| 85 | 
         
            +
                        return {}, text or ""
         
     | 
| 86 | 
         
            +
                    m = re.match(r"^---\s*\n([\s\S]*?)\n---\s*\n", text)
         
     | 
| 87 | 
         
            +
                    if not m:
         
     | 
| 88 | 
         
            +
                        return {}, text
         
     | 
| 89 | 
         
            +
                    fm_text = m.group(1)
         
     | 
| 90 | 
         
            +
                    body = text[m.end() :]
         
     | 
| 91 | 
         
            +
                    try:
         
     | 
| 92 | 
         
            +
                        data = yaml.safe_load(fm_text)
         
     | 
| 93 | 
         
            +
                        if not isinstance(data, dict):
         
     | 
| 94 | 
         
            +
                            data = {}
         
     | 
| 95 | 
         
            +
                    except Exception:
         
     | 
| 96 | 
         
            +
                        data = {}
         
     | 
| 97 | 
         
            +
                    return data, body
         
     | 
| 98 | 
         
            +
             
     | 
| 99 | 
         
            +
                def _pipeline_docs_url(self, pipeline_tag: Optional[str]) -> Optional[str]:
         
     | 
| 100 | 
         
            +
                    base = "https://huggingface.co/docs/transformers.js/api/pipelines"
         
     | 
| 101 | 
         
            +
                    if not pipeline_tag:
         
     | 
| 102 | 
         
            +
                        return base
         
     | 
| 103 | 
         
            +
                    mapping = {
         
     | 
| 104 | 
         
            +
                        "text-classification": "TextClassificationPipeline",
         
     | 
| 105 | 
         
            +
                        "token-classification": "TokenClassificationPipeline",
         
     | 
| 106 | 
         
            +
                        "question-answering": "QuestionAnsweringPipeline",
         
     | 
| 107 | 
         
            +
                        "fill-mask": "FillMaskPipeline",
         
     | 
| 108 | 
         
            +
                        "text2text-generation": "Text2TextGenerationPipeline",
         
     | 
| 109 | 
         
            +
                        "summarization": "SummarizationPipeline",
         
     | 
| 110 | 
         
            +
                        "translation": "TranslationPipeline",
         
     | 
| 111 | 
         
            +
                        "text-generation": "TextGenerationPipeline",
         
     | 
| 112 | 
         
            +
                        "zero-shot-classification": "ZeroShotClassificationPipeline",
         
     | 
| 113 | 
         
            +
                        "feature-extraction": "FeatureExtractionPipeline",
         
     | 
| 114 | 
         
            +
                        "image-feature-extraction": "ImageFeatureExtractionPipeline",
         
     | 
| 115 | 
         
            +
                        "audio-classification": "AudioClassificationPipeline",
         
     | 
| 116 | 
         
            +
                        "zero-shot-audio-classification": "ZeroShotAudioClassificationPipeline",
         
     | 
| 117 | 
         
            +
                        "automatic-speech-recognition": "AutomaticSpeechRecognitionPipeline",
         
     | 
| 118 | 
         
            +
                        "image-to-text": "ImageToTextPipeline",
         
     | 
| 119 | 
         
            +
                        "image-classification": "ImageClassificationPipeline",
         
     | 
| 120 | 
         
            +
                        "image-segmentation": "ImageSegmentationPipeline",
         
     | 
| 121 | 
         
            +
                        "background-removal": "BackgroundRemovalPipeline",
         
     | 
| 122 | 
         
            +
                        "zero-shot-image-classification": "ZeroShotImageClassificationPipeline",
         
     | 
| 123 | 
         
            +
                        "object-detection": "ObjectDetectionPipeline",
         
     | 
| 124 | 
         
            +
                        "zero-shot-object-detection": "ZeroShotObjectDetectionPipeline",
         
     | 
| 125 | 
         
            +
                        "document-question-answering": "DocumentQuestionAnsweringPipeline",
         
     | 
| 126 | 
         
            +
                        "text-to-audio": "TextToAudioPipeline",
         
     | 
| 127 | 
         
            +
                        "image-to-image": "ImageToImagePipeline",
         
     | 
| 128 | 
         
            +
                        "depth-estimation": "DepthEstimationPipeline",
         
     | 
| 129 | 
         
            +
                    }
         
     | 
| 130 | 
         
            +
                    cls = mapping.get(pipeline_tag)
         
     | 
| 131 | 
         
            +
                    if not cls:
         
     | 
| 132 | 
         
            +
                        return base
         
     | 
| 133 | 
         
            +
                    return f"{base}#module_pipelines.{cls}"
         
     | 
| 134 | 
         
            +
             
     | 
| 135 | 
         
            +
                def _map_pipeline_to_task(self, pipeline_tag: Optional[str]) -> Optional[str]:
         
     | 
| 136 | 
         
            +
                    if not pipeline_tag:
         
     | 
| 137 | 
         
            +
                        return None
         
     | 
| 138 | 
         
            +
                    synonyms = {
         
     | 
| 139 | 
         
            +
                        "vqa": "visual-question-answering",
         
     | 
| 140 | 
         
            +
                    }
         
     | 
| 141 | 
         
            +
                    return synonyms.get(pipeline_tag, pipeline_tag)
         
     | 
| 142 | 
         
            +
             
     | 
| 143 | 
         
             
                def setup_repository(self) -> None:
         
     | 
| 144 | 
         
             
                    """Ensure the bundled transformers.js repository is present."""
         
     | 
| 145 | 
         
             
                    if not self.config.repo_path.exists():
         
     | 
| 
         | 
|
| 194 | 
         
             
                        if output_attentions:
         
     | 
| 195 | 
         
             
                            extra_args.append("--output_attentions")
         
     | 
| 196 | 
         | 
| 197 | 
         
            +
                        try:
         
     | 
| 198 | 
         
            +
                            info = model_info(repo_id=input_model_id, token=self.config.hf_token)
         
     | 
| 199 | 
         
            +
                            task = self._map_pipeline_to_task(getattr(info, "pipeline_tag", None))
         
     | 
| 200 | 
         
            +
                            if task:
         
     | 
| 201 | 
         
            +
                                extra_args.extend(["--task", task])
         
     | 
| 202 | 
         
            +
                        except Exception:
         
     | 
| 203 | 
         
            +
                            pass
         
     | 
| 204 | 
         
            +
             
     | 
| 205 | 
         
             
                        result = self._run_conversion_subprocess(
         
     | 
| 206 | 
         
             
                            input_model_id, extra_args=extra_args or None
         
     | 
| 207 | 
         
             
                        )
         
     | 
| 
         | 
|
| 223 | 
         | 
| 224 | 
         
             
                        readme_path = f"{model_folder_path}/README.md"
         
     | 
| 225 | 
         | 
| 226 | 
         
            +
                        with open(readme_path, "w") as file:
         
     | 
| 227 | 
         
            +
                            file.write(self.generate_readme(input_model_id))
         
     | 
| 
         | 
|
| 228 | 
         | 
| 229 | 
         
             
                        self.api.upload_folder(
         
     | 
| 230 | 
         
             
                            folder_path=str(model_folder_path), repo_id=output_model_id
         
     | 
| 
         | 
|
| 236 | 
         
             
                        shutil.rmtree(model_folder_path, ignore_errors=True)
         
     | 
| 237 | 
         | 
| 238 | 
         
             
                def generate_readme(self, imi: str):
         
     | 
| 239 | 
         
            +
                    try:
         
     | 
| 240 | 
         
            +
                        info = model_info(repo_id=imi, token=self.config.hf_token)
         
     | 
| 241 | 
         
            +
                        pipeline_tag = getattr(info, "pipeline_tag", None)
         
     | 
| 242 | 
         
            +
                    except Exception:
         
     | 
| 243 | 
         
            +
                        pipeline_tag = None
         
     | 
| 244 | 
         
            +
             
     | 
| 245 | 
         
            +
                    original_text = self._fetch_original_readme(imi)
         
     | 
| 246 | 
         
            +
                    original_meta, original_body = self._extract_yaml_frontmatter(original_text)
         
     | 
| 247 | 
         
            +
                    original_body = (
         
     | 
| 248 | 
         
            +
                        original_body or self._strip_yaml_frontmatter(original_text)
         
     | 
| 249 | 
         
            +
                    ).strip()
         
     | 
| 250 | 
         
            +
             
     | 
| 251 | 
         
            +
                    merged_meta = {}
         
     | 
| 252 | 
         
            +
                    if isinstance(original_meta, dict):
         
     | 
| 253 | 
         
            +
                        merged_meta.update(original_meta)
         
     | 
| 254 | 
         
            +
                    merged_meta["library_name"] = "transformers.js"
         
     | 
| 255 | 
         
            +
                    merged_meta["base_model"] = [imi]
         
     | 
| 256 | 
         
            +
                    if pipeline_tag is not None:
         
     | 
| 257 | 
         
            +
                        merged_meta["pipeline_tag"] = pipeline_tag
         
     | 
| 258 | 
         
            +
             
     | 
| 259 | 
         
            +
                    fm_yaml = yaml.safe_dump(merged_meta, sort_keys=False).strip()
         
     | 
| 260 | 
         
            +
                    header = f"---\n{fm_yaml}\n---\n\n"
         
     | 
| 261 | 
         
            +
             
     | 
| 262 | 
         
            +
                    parts: List[str] = []
         
     | 
| 263 | 
         
            +
                    parts.append(header)
         
     | 
| 264 | 
         
            +
                    parts.append(f"# {imi.split('/')[-1]} (ONNX)\n")
         
     | 
| 265 | 
         
            +
                    parts.append(
         
     | 
| 266 | 
         
             
                        f"This is an ONNX version of [{imi}](https://huggingface.co/{imi}). "
         
     | 
| 267 | 
         
             
                        "It was automatically converted and uploaded using "
         
     | 
| 268 | 
         
            +
                        "[this Hugging Face Space](https://huggingface.co/spaces/onnx-community/convert-to-onnx)."
         
     | 
| 269 | 
         
             
                    )
         
     | 
| 270 | 
         | 
| 271 | 
         
            +
                    docs_url = self._pipeline_docs_url(pipeline_tag)
         
     | 
| 272 | 
         
            +
                    if docs_url:
         
     | 
| 273 | 
         
            +
                        parts.append("\n## Usage with Transformers.js\n")
         
     | 
| 274 | 
         
            +
                        if pipeline_tag:
         
     | 
| 275 | 
         
            +
                            parts.append(
         
     | 
| 276 | 
         
            +
                                f"See the pipeline documentation for `{pipeline_tag}`: {docs_url}"
         
     | 
| 277 | 
         
            +
                            )
         
     | 
| 278 | 
         
            +
                        else:
         
     | 
| 279 | 
         
            +
                            parts.append(f"See the pipelines documentation: {docs_url}")
         
     | 
| 280 | 
         
            +
             
     | 
| 281 | 
         
            +
                    if original_body:
         
     | 
| 282 | 
         
            +
                        parts.append("\n---\n")
         
     | 
| 283 | 
         
            +
                        parts.append(original_body)
         
     | 
| 284 | 
         
            +
             
     | 
| 285 | 
         
            +
                    return "\n\n".join(parts) + "\n"
         
     | 
| 286 | 
         
            +
             
     | 
| 287 | 
         | 
| 288 | 
         
             
            def main():
         
     | 
| 289 | 
         
             
                """Main application entry point."""
         
     | 
| 
         | 
|
| 320 | 
         | 
| 321 | 
         
             
                    if config.hf_username == input_model_id.split("/")[0]:
         
     | 
| 322 | 
         
             
                        same_repo = st.checkbox(
         
     | 
| 323 | 
         
            +
                            "Upload the ONNX weights to the existing repository"
         
     | 
| 324 | 
         
             
                        )
         
     | 
| 325 | 
         
             
                    else:
         
     | 
| 326 | 
         
             
                        same_repo = False
         
     | 
    	
        requirements.txt
    CHANGED
    
    | 
         @@ -1,5 +1,6 @@ 
     | 
|
| 1 | 
         
             
            huggingface_hub==0.35.3
         
     | 
| 2 | 
         
             
            streamlit==1.50.0
         
     | 
| 
         | 
|
| 3 | 
         
             
            onnxscript==0.5.4
         
     | 
| 4 | 
         
             
            onnxconverter_common==1.16.0
         
     | 
| 5 | 
         
             
            onnx_graphsurgeon==0.5.8
         
     | 
| 
         | 
|
| 1 | 
         
             
            huggingface_hub==0.35.3
         
     | 
| 2 | 
         
             
            streamlit==1.50.0
         
     | 
| 3 | 
         
            +
            PyYAML==6.0.2
         
     | 
| 4 | 
         
             
            onnxscript==0.5.4
         
     | 
| 5 | 
         
             
            onnxconverter_common==1.16.0
         
     | 
| 6 | 
         
             
            onnx_graphsurgeon==0.5.8
         
     |