Run inference with Hugging Face models across tasks such as text generation, embeddings, image generation, and vision.
```python
from huggingface_hub import InferenceClient

# Serverless Inference API client pointed at an instruct model
client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.2")

response = client.text_generation(
    prompt="Once upon a time",
    max_new_tokens=100,   # cap on generated tokens
    temperature=0.7,      # sampling temperature; lower is more deterministic
)
```
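`text_generation` can also stream tokens as they are produced; a minimal sketch reusing the client above (`stream=True` makes the call return an iterator of strings):

```python
# Print tokens as they arrive instead of waiting for the full completion
for token in client.text_generation(
    prompt="Once upon a time",
    max_new_tokens=100,
    stream=True,
):
    print(token, end="", flush=True)
```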
For embeddings:

```python
from huggingface_hub import InferenceClient

# Embedding model served through the same Inference API
client = InferenceClient(model="BAAI/bge-large-en-v1.5")

embedding = client.feature_extraction(text="Hello, world!")
```
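A common follow-up is comparing two embeddings. A minimal sketch with NumPy, assuming the client above (`feature_extraction` returns a NumPy array):

```python
import numpy as np

a = client.feature_extraction(text="Hello, world!")
b = client.feature_extraction(text="Hi there!")

# Flatten in case the server returns a (1, dim) array
a, b = np.asarray(a).ravel(), np.asarray(b).ravel()

# Cosine similarity: higher means more semantically similar
similarity = float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
print(similarity)
```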
For local inference, `transformers` pipelines bundle a tokenizer and model behind a one-line API:

```python
from transformers import pipeline

# Text generation
generator = pipeline("text-generation", model="gpt2")
result = generator("Once upon a time", max_new_tokens=50)

# Summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
long_text = "..."  # any document to condense
summary = summarizer(long_text, max_length=100)

# Zero-shot classification
classifier = pipeline("zero-shot-classification")
text = "The stock market rallied after the earnings report."
labels = ["education", "business", "tech"]
result = classifier(text, candidate_labels=labels)
```
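Pipelines run on CPU by default; passing a device index moves the model to a GPU. A sketch, assuming CUDA device 0 is available:

```python
import torch
from transformers import pipeline

# device=0 places the model on the first CUDA GPU; -1 (the default) keeps it on CPU
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline("zero-shot-classification", device=device)

result = classifier(
    "The stock market rallied after the earnings report.",
    candidate_labels=["education", "business", "tech"],
)
# result is a dict with "labels" and "scores" sorted from most to least likely
print(result["labels"][0], result["scores"][0])
```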
Image generation with `diffusers`:

```python
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1")
pipe.to("cuda")  # requires a CUDA-capable GPU

image = pipe("a photo of an astronaut riding a horse").images[0]
```
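On memory-constrained GPUs, loading in half precision and enabling attention slicing reduces VRAM use. A sketch, assuming a CUDA device:

```python
import torch
from diffusers import StableDiffusionPipeline

# fp16 weights roughly halve VRAM; attention slicing trades speed for memory
pipe = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1",
    torch_dtype=torch.float16,
)
pipe.to("cuda")
pipe.enable_attention_slicing()

image = pipe("a photo of an astronaut riding a horse").images[0]
image.save("astronaut.png")
```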
Dedicated Inference Endpoints for production are created programmatically with `huggingface_hub` (the compute values below are illustrative; available vendors, regions, and instance types are listed in the Inference Endpoints catalog):

```python
from huggingface_hub import create_inference_endpoint

# Create a dedicated endpoint (compute values are example choices)
endpoint = create_inference_endpoint(
    "my-endpoint",
    repository="mistralai/Mistral-7B-Instruct-v0.2",
    framework="pytorch",
    task="text-generation",
    accelerator="gpu",
    vendor="aws",
    region="us-east-1",
    instance_size="x1",
    instance_type="nvidia-a10g",
)
```
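Once created, the endpoint object can be polled until it is running and then queried through its built-in client; a sketch continuing from the snippet above:

```python
# Block until the endpoint is deployed, then query it
endpoint.wait()

response = endpoint.client.text_generation(
    prompt="Once upon a time",
    max_new_tokens=100,
)
print(response)

# Pause the endpoint when idle to stop billing
endpoint.pause()
```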
| Task | Recommended Models |
|---|---|
| Text generation | Mistral-7B, Llama-3, Gemma |
| Embeddings | BGE-large, E5, Instructor |
| Image generation | SDXL, Flux, Playground |
| Vision | LLaVA, Qwen2-VL, CogVLM |
| Speech | Whisper, Bark, MusicGen |
For efficient local inference, pair `transformers` with the `accelerate` library (device placement and sharding for large models) and `bitsandbytes` (8-bit and 4-bit quantization to cut memory use).
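A minimal sketch of 4-bit quantized loading through `transformers`, assuming `bitsandbytes` is installed and a CUDA GPU is available:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 4-bit NF4 quantization via bitsandbytes; device_map="auto" lets accelerate
# place the model across available devices
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    quantization_config=quant_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

inputs = tokenizer("Once upon a time", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```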