Commit 0304cc2

Merge pull request #31 from victor23k/bert-text-classification-example
Add text classification example with distilbert
2 parents 417aeb3 + b801b7d commit 0304cc2

3 files changed

Lines changed: 63 additions & 0 deletions

File tree

examples/distilbert/README.md

Lines changed: 9 additions & 0 deletions
# DistilBert exported to ONNX with HuggingFace transformers

### Running

Run `python export.py` to create the ONNX model for distilbert/distilbert-base-uncased-finetuned-sst-2-english, then `mix run` the `distilbert_classification.exs` script.

### Labels

When exporting the model from Hugging Face Transformers to ONNX, a `config.json` file is written to the chosen directory. This file contains the id-to-label mappings, which you can read directly to assign a label to the input, as shown in `distilbert_classification.exs`.
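The id-to-label lookup described above can be sketched in a few lines of Python. The inline `config_json` string below is a hypothetical stand-in for the real `config.json` the export writes; SST-2 fine-tuned checkpoints conventionally use a two-class NEGATIVE/POSITIVE mapping of this shape.

```python
import json

# Hypothetical stand-in for the config.json written by the ONNX export;
# the real file contains an "id2label" object of the same shape.
config_json = '{"id2label": {"0": "NEGATIVE", "1": "POSITIVE"}}'

id2label = json.loads(config_json)["id2label"]

def id_to_label(class_id):
    # JSON object keys are strings, so the integer class id is stringified first.
    return id2label[str(class_id)]

print(id_to_label(1))  # POSITIVE
```

This mirrors what `Inference.id_to_label/1` does in the Elixir script, where the stringification step is the same because `Jason.decode` also yields string keys.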
examples/distilbert/distilbert_classification.exs

Lines changed: 34 additions & 0 deletions
defmodule Inference do
  # Look up the human-readable label for a class id in the exported config.json.
  def id_to_label(id) do
    {:ok, config_json} = File.read("./models/distilbert-onnx/config.json")
    {:ok, %{"id2label" => id2label}} = Jason.decode(config_json)
    Map.get(id2label, to_string(id))
  end

  def run() do
    model = Ortex.load("./models/distilbert-onnx/model.onnx")

    text =
      "the movie had a lot of nuance and interesting artistic choices, would like to see more support in the industry for these types of productions"

    {:ok, tokenizer} = Tokenizers.Tokenizer.from_file("./models/distilbert-onnx/tokenizer.json")
    {:ok, encoding} = Tokenizers.Tokenizer.encode(tokenizer, text)

    # Batch of one: token ids and attention mask as 2D tensors.
    input = Nx.tensor([Tokenizers.Encoding.get_ids(encoding)])
    mask = Nx.tensor([Tokenizers.Encoding.get_attention_mask(encoding)])

    {output} = Ortex.run(model, {input, mask})

    IO.inspect(output)

    # Pick the highest-scoring class and map it to its label.
    IO.inspect(
      output
      |> Nx.backend_transfer()
      |> Nx.argmax()
      |> Nx.to_number()
      |> id_to_label()
    )
  end
end

Inference.run()
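The script selects the winning class with `Nx.argmax`, which discards how confident the model was. If a confidence score is also wanted, a softmax over the two logits gives class probabilities. A minimal pure-Python sketch (the logit values are made up for illustration, not taken from a real run):

```python
import math

def softmax(logits):
    # Subtract the max before exponentiating for numerical stability.
    m = max(logits)
    exps = [math.exp(x - m) for x in logits]
    total = sum(exps)
    return [e / total for e in exps]

# Made-up logits for a two-class (NEGATIVE, POSITIVE) head.
logits = [-2.1, 2.4]
probs = softmax(logits)

# Same decision as argmax on the raw logits, but with a probability attached.
label_index = probs.index(max(probs))
print(label_index, round(max(probs), 3))
```

The same transformation could be done in the Elixir pipeline with `Nx.exp/1` and `Nx.sum/1` (or `Axon.Activations.softmax/1`) before the `Nx.argmax` step.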

examples/distilbert/export.py

Lines changed: 20 additions & 0 deletions
"""
### Install dependencies:

$ pip install transformers
$ pip install optimum
$ pip install "transformers[onnx]"
"""

from transformers import DistilBertTokenizer
from optimum.onnxruntime import ORTModelForSequenceClassification

save_directory = "./models/distilbert-onnx/"

# Download the fine-tuned SST-2 checkpoint and export it to ONNX.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = ORTModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english", export=True)
print(model)

# Write model.onnx, config.json, and the tokenizer files to save_directory.
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)
