Upload Your First Model
Learn how to upload your model to the Clarifai platform
The Clarifai platform allows you to upload custom models for a wide range of use cases. With just a few simple steps, you can get your models up and running and leverage the platform’s powerful capabilities.
Let's demonstrate how you can upload the Llama-3.2-1B-Instruct model from Hugging Face to the Clarifai platform.
Step 1: Perform Prerequisites
Install Clarifai Package
Install the latest version of the clarifai Python SDK. This also installs the Clarifai Command Line Interface (CLI), which we'll use to upload the model.
- Bash
pip install --upgrade clarifai
Set a PAT Key
You need to set the CLARIFAI_PAT (Personal Access Token) as an environment variable. You can generate a PAT on your personal settings page by navigating to the Security section.
This token is essential for authenticating your connection to the Clarifai platform.
- Bash
export CLARIFAI_PAT=YOUR_PERSONAL_ACCESS_TOKEN_HERE
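Before moving on, you can optionally confirm that the token is visible to your Python environment. This is a minimal sanity check using only the standard library; it is not part of the upload flow itself.
- Python
import os

# Optional sanity check: make sure the CLARIFAI_PAT environment variable is set
pat = os.environ.get("CLARIFAI_PAT")
if not pat:
    raise RuntimeError("CLARIFAI_PAT is not set; export it before uploading your model.")
print("CLARIFAI_PAT is set.")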
Step 2: Create Files
Create a project directory and organize your files as shown below to meet the requirements for uploading models to the Clarifai platform.
your_model_directory/
├── 1/
│   └── model.py
├── requirements.txt
└── config.yaml
- your_model_directory/ – The main directory containing your model files.
- 1/ – A subdirectory that holds the model.py file (note that this folder must be named 1).
- model.py – Contains the code that defines your model, including loading the model and running inference.
- requirements.txt – Lists the Python libraries and dependencies required to run your model.
- config.yaml – Contains model metadata and configuration details necessary for building the Docker image, defining compute resources, and uploading the model to Clarifai.
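If you prefer to script this setup, here is a minimal sketch that creates the expected layout (the directory name your_model_directory is just a placeholder; use any name you like):
- Python
from pathlib import Path

# Create the expected upload layout: your_model_directory/1/model.py,
# plus requirements.txt and config.yaml at the top level
root = Path("your_model_directory")
(root / "1").mkdir(parents=True, exist_ok=True)
for path in (root / "1" / "model.py", root / "requirements.txt", root / "config.yaml"):
    path.touch(exist_ok=True)
print(f"Created project skeleton under {root.resolve()}")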
Add the following snippets to each of the respective files.
model.py
- Python
import os
from threading import Thread
from typing import Iterator, List, Optional

import torch
from clarifai.runners.models.model_class import ModelClass
from clarifai.runners.models.model_builder import ModelBuilder
from clarifai.utils.logging import logger
from clarifai_grpc.grpc.api import resources_pb2, service_pb2
from clarifai_grpc.grpc.api.status import status_code_pb2, status_pb2
from google.protobuf import json_format
from transformers import (AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer)


# Custom streamer for batched text generation
class BatchTextIteratorStreamer(TextIteratorStreamer):
    """A custom streamer that handles batched text generation."""

    def __init__(self,
                 batch_size: int,
                 tokenizer: "AutoTokenizer",
                 skip_prompt: bool = False,
                 timeout: Optional[float] = None,
                 **decode_kwargs):
        super().__init__(tokenizer, skip_prompt, timeout, **decode_kwargs)
        self.batch_size = batch_size
        self.token_cache = [[] for _ in range(batch_size)]
        self.print_len = [0 for _ in range(batch_size)]
        self.generate_exception = None

    def put(self, value):
        if len(value.shape) != 2:
            value = torch.reshape(value, (self.batch_size, value.shape[0] // self.batch_size))

        if self.skip_prompt and self.next_tokens_are_prompt:
            self.next_tokens_are_prompt = False
            return

        printable_texts = list()
        for idx in range(self.batch_size):
            self.token_cache[idx].extend(value[idx].tolist())
            text = self.tokenizer.decode(self.token_cache[idx], **self.decode_kwargs)

            if text.endswith("\n"):
                printable_text = text[self.print_len[idx]:]
                self.token_cache[idx] = []
                self.print_len[idx] = 0
            # If the last token is a CJK character, we print the characters.
            elif len(text) > 0 and self._is_chinese_char(ord(text[-1])):
                printable_text = text[self.print_len[idx]:]
                self.print_len[idx] += len(printable_text)
            else:
                printable_text = text[self.print_len[idx]:text.rfind(" ") + 1]
                self.print_len[idx] += len(printable_text)
            printable_texts.append(printable_text)

        self.on_finalized_text(printable_texts)

    def end(self):
        printable_texts = list()
        for idx in range(self.batch_size):
            if len(self.token_cache[idx]) > 0:
                text = self.tokenizer.decode(self.token_cache[idx], **self.decode_kwargs)
                printable_text = text[self.print_len[idx]:]
                self.token_cache[idx] = []
                self.print_len[idx] = 0
            else:
                printable_text = ""
            printable_texts.append(printable_text)

        self.next_tokens_are_prompt = True
        self.on_finalized_text(printable_texts, stream_end=True)

    def on_finalized_text(self, texts: List[str], stream_end: bool = False):
        self.text_queue.put(texts, timeout=self.timeout)
        if stream_end:
            self.text_queue.put(self.stop_signal, timeout=self.timeout)


# Helper function to create an output
def create_output(text="", code=status_code_pb2.SUCCESS):
    return resources_pb2.Output(
        data=resources_pb2.Data(text=resources_pb2.Text(raw=text)),
        status=status_pb2.Status(code=code))


# Helper function to get the inference params
def get_inference_params(request) -> dict:
    """Get the inference params from the request."""
    inference_params = {}
    if request.model.model_version.id != "":
        output_info = request.model.model_version.output_info
        output_info = json_format.MessageToDict(output_info, preserving_proto_field_name=True)
        if "params" in output_info:
            inference_params = output_info["params"]
    return inference_params


# Helper function to parse the inference params
def parse_inference_params(request):
    default_params = {
        "temperature": 0.7,
        "max_tokens": 100,
        "top_k": 50,
        "top_p": 1.0,
        "do_sample": True,
    }
    inference_params = get_inference_params(request)
    return {
        "temperature": inference_params.get("temperature", default_params["temperature"]),
        "max_tokens": int(inference_params.get("max_tokens", default_params["max_tokens"])),
        "top_k": int(inference_params.get("top_k", default_params["top_k"])),
        "top_p": inference_params.get("top_p", default_params["top_p"]),
        "do_sample": inference_params.get("do_sample", default_params["do_sample"]),
    }


class MyModel(ModelClass):
    """A custom runner that loads the model and generates text using batched inference."""

    def load_model(self):
        """Load the model here."""
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        logger.info(f"Running on device: {self.device}")

        # Load checkpoints
        model_path = os.path.dirname(os.path.dirname(__file__))
        builder = ModelBuilder(model_path, download_validation_only=True)
        checkpoints = builder.download_checkpoints(stage="runtime")

        # Load model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoints)
        self.tokenizer.pad_token = self.tokenizer.eos_token  # Set pad token to eos token
        self.model = AutoModelForCausalLM.from_pretrained(
            checkpoints,
            low_cpu_mem_usage=True,
            device_map=self.device,
            torch_dtype=torch.bfloat16,
        )
        logger.info("Done loading!")

    def predict(self,
                request: service_pb2.PostModelOutputsRequest) -> service_pb2.MultiOutputResponse:
        """This method generates output text for the given inputs using the model."""
        inference_params = parse_inference_params(request)

        prompts = [inp.data.text.raw for inp in request.inputs]
        inputs = self.tokenizer(prompts, return_tensors="pt", padding=True).to(self.device)
        output_tokens = self.model.generate(
            **inputs,
            max_new_tokens=inference_params["max_tokens"],
            do_sample=inference_params["do_sample"],
            temperature=inference_params["temperature"],
            top_k=inference_params["top_k"],
            top_p=inference_params["top_p"],
            eos_token_id=self.tokenizer.eos_token_id,
        )
        outputs_text = self.tokenizer.batch_decode(
            output_tokens[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True)

        outputs = []
        for text in outputs_text:
            outputs.append(create_output(text=text, code=status_code_pb2.SUCCESS))

        return service_pb2.MultiOutputResponse(
            outputs=outputs, status=status_pb2.Status(code=status_code_pb2.SUCCESS))

    def generate(self, request: service_pb2.PostModelOutputsRequest
                ) -> Iterator[service_pb2.MultiOutputResponse]:
        """This method generates a stream of outputs for the given batch of inputs using the model."""
        inference_params = parse_inference_params(request)

        prompts = [inp.data.text.raw for inp in request.inputs]
        batch_size = len(prompts)

        # Initialize the custom streamer
        streamer = BatchTextIteratorStreamer(
            batch_size=batch_size,
            tokenizer=self.tokenizer,
            skip_prompt=True,
            skip_special_tokens=True,  # forwarded to tokenizer.decode via **decode_kwargs
        )

        # Tokenize the inputs
        inputs = self.tokenizer(prompts, return_tensors="pt", padding=True).to(self.device)
        generation_kwargs = {
            "input_ids": inputs.input_ids,
            "attention_mask": inputs.attention_mask,
            "max_new_tokens": inference_params["max_tokens"],
            "do_sample": inference_params["do_sample"],
            "temperature": inference_params["temperature"],
            "top_k": inference_params["top_k"],
            "top_p": inference_params["top_p"],
            "eos_token_id": self.tokenizer.eos_token_id,
            "streamer": streamer,
        }

        # Start generation in a separate thread
        thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
        thread.start()

        # Initialize outputs
        outputs = [create_output() for _ in range(batch_size)]

        try:
            for streamed_texts in streamer:  # Iterate over new texts generated
                for idx, text in enumerate(streamed_texts):  # Iterate over each batch
                    outputs[idx].data.text.raw = text  # Append new text to each output
                    outputs[idx].status.code = status_code_pb2.SUCCESS
                # Yield the current outputs
                yield service_pb2.MultiOutputResponse(
                    outputs=outputs, status=status_pb2.Status(code=status_code_pb2.SUCCESS))
        finally:
            thread.join()

    def stream(self, request_iterator: Iterator[service_pb2.PostModelOutputsRequest]
              ) -> Iterator[service_pb2.MultiOutputResponse]:
        raise NotImplementedError("Stream method is not implemented for this model.")
requirements.txt
- Text
torch==2.5.1
tokenizers>=0.21.0
transformers>=4.47.0
accelerate>=1.2.0
scipy==1.10.1
optimum>=1.23.3
xformers==0.0.28.post3
protobuf==5.27.3
einops>=0.8.0
requests==2.32.3
clarifai
config.yaml
In the model section of the config.yaml file, specify your model ID, Clarifai user ID, and Clarifai app ID. These define where your model will be uploaded on the Clarifai platform. In the checkpoints section, also specify your hf_token to authenticate your connection to Hugging Face so the model checkpoints can be downloaded.
- YAML
# This is the sample config file for the llama model.

model:
  id: "llama_3_2_1b_instruct"
  user_id: "user_id"
  app_id: "app_id"
  model_type_id: "text-to-text"

build_info:
  python_version: "3.11"

inference_compute_info:
  cpu_limit: "1"
  cpu_memory: "13Gi"
  num_accelerators: 1
  accelerator_type: ["NVIDIA*"]
  accelerator_memory: "18Gi"

checkpoints:
  type: "huggingface"
  repo_id: "meta-llama/Llama-3.2-1B-Instruct"
  when: "runtime"
  hf_token: "hf_token"
Step 3: Upload the Model
Once your custom model is ready, upload it to the Clarifai platform by navigating to its directory and running the following command:
- CLI
clarifai model upload
Congratulations — you've just uploaded your first model to the Clarifai platform!
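As a next step, you can try calling your uploaded model from the Python SDK once it has finished building. The sketch below assumes the model lives at https://clarifai.com/user_id/app_id/models/llama_3_2_1b_instruct (substitute your own user ID, app ID, and model ID) and that CLARIFAI_PAT is still set in your environment; exact client methods may vary slightly between SDK versions.
- Python
from clarifai.client.model import Model

# Hypothetical model URL; replace user_id, app_id, and the model ID with your own values
model = Model(url="https://clarifai.com/user_id/app_id/models/llama_3_2_1b_instruct")

# Send a text prompt to the model and print the generated text
response = model.predict_by_bytes(b"What is the future of AI?", input_type="text")
print(response.outputs[0].data.text.raw)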