Offline inference, batch processing, and script integration with vLLM.
vLLM's LLM class provides a simple Python API for offline inference. This skill covers the LLM class usage, batch processing, and integration into Python scripts.
# Install first: pip install vllm
from vllm import LLM, SamplingParams

# Initialize the model once — weight loading is expensive, so reuse the
# same LLM instance for every generate() call below.
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf")

# Sampling parameters shared by both examples below.
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.95,
    max_tokens=100,
)

# Single prompt. generate() always returns a list of RequestOutput
# objects, even for one prompt, hence output[0].
prompt = "What is machine learning?"
output = llm.generate(prompt, sampling_params)
print(output[0].outputs[0].text)

prompts = [
    "What is AI?",
    "Explain Python programming",
    "Define cloud computing",
    "What is deep learning?",
]

# Process all prompts in one batched call; vLLM schedules them together,
# and outputs come back in the same order as the prompts.
outputs = llm.generate(prompts, sampling_params)
for prompt, output in zip(prompts, outputs):
    print(f"Prompt: {prompt}")
    print(f"Response: {output.outputs[0].text}")
    print("---")
import asyncio
from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams

# Initialize the async engine (used for streaming / concurrent requests).
engine_args = AsyncEngineArgs(
    model="meta-llama/Llama-2-7b-chat-hf"
)
engine = AsyncLLMEngine.from_engine_args(engine_args)

async def generate_async(prompt, request_id="1"):
    """Generate a completion for *prompt* and return the final RequestOutput.

    The engine streams cumulative partial outputs; only the last yielded
    value is kept. Returns None if the engine yields nothing at all
    (the original code raised NameError in that case).

    request_id must be unique per in-flight request when issuing
    concurrent calls; the default preserves the original example.
    """
    sampling_params = SamplingParams(temperature=0.7, max_tokens=100)
    final_output = None  # guard: avoid NameError if no output is yielded
    async for output in engine.generate(prompt, sampling_params, request_id=request_id):
        final_output = output
    return final_output

# Run async generation
result = asyncio.run(generate_async("What is AI?"))
print(result.outputs[0].text)
import json

# Run the batch and build one record per prompt: the prompt text, the
# generated response, and how many tokens were produced.
outputs = llm.generate(prompts, sampling_params)
results = [
    {
        "prompt": prompt,
        "response": output.outputs[0].text,
        "tokens": len(output.outputs[0].token_ids),
    }
    for prompt, output in zip(prompts, outputs)
]

# Persist the records as pretty-printed JSON.
with open("outputs.json", "w") as f:
    json.dump(results, f, indent=2)
import pandas as pd
from vllm import LLM, SamplingParams

# Load the input dataset; expects an "input_text" column.
df = pd.read_csv("input.csv")

# Initialize the model a single time and reuse it for every batch.
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf")
sampling_params = SamplingParams(temperature=0.7, max_tokens=150)

# Walk the dataframe in fixed-size slices, generating one batch at a time
# and reporting progress after each slice.
batch_size = 32
results = []
for start in range(0, len(df), batch_size):
    chunk = df.iloc[start:start + batch_size]
    prompts = chunk["input_text"].tolist()
    outputs = llm.generate(prompts, sampling_params)
    results.extend(o.outputs[0].text for o in outputs)
    print(f"Processed {min(start + batch_size, len(df))}/{len(df)}")

# Attach the generated text and write the augmented dataset back out.
df["output"] = results
df.to_csv("output.csv", index=False)
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-2-7b-chat-hf")

# Conversation as role/content messages, OpenAI-style.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is Python?"},
]

# The model's tokenizer carries its chat template. Render the messages to
# a plain-text prompt (tokenize=False) and append the generation prompt so
# the model continues as the assistant.
tokenizer = llm.get_tokenizer()
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

output = llm.generate(prompt, SamplingParams(max_tokens=100))
print(output[0].outputs[0].text)
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-2-7b-chat-hf")

# Token-by-token streaming requires AsyncLLMEngine (see the async example
# above). Below: plain batch generation with per-result progress output.
prompts = [f"Question {idx}" for idx in range(100)]
outputs = llm.generate(prompts, SamplingParams(max_tokens=50))
for position, _output in enumerate(outputs, start=1):
    print(f"[{position}/{len(prompts)}] Done")
from vllm import LLM

# Shard the model across 4 GPUs with tensor parallelism — needed for
# models (such as this 70B checkpoint) that exceed a single device's memory.
llm = LLM(
    model="meta-llama/Llama-2-70b-chat-hf",
    tensor_parallel_size=4,  # number of GPUs to shard across
)

output = llm.generate("Hello, how are you?")
print(output[0].outputs[0].text)
Solution for out-of-memory (OOM) errors at model load or inference time:
# Option 1: leave more GPU headroom and cap the context window.
llm = LLM(
    model="meta-llama/Llama-2-7b-chat-hf",
    gpu_memory_utilization=0.8,  # default is 0.9
    max_model_len=4096,  # shorter max sequence -> smaller KV cache
)

# Option 2: load quantized weights instead.
llm = LLM(
    model="meta-llama/Llama-2-7b-chat-hf",
    quantization="awq",  # or "gptq", "fp8"
)
Solution for slow first inference: the first generate() call includes model loading and setup time. Warm the engine up before timing-sensitive work:
# Warm the engine with a throwaway prompt so one-time startup work is
# paid before the real workload.
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf")
_ = llm.generate("Hello", SamplingParams(max_tokens=10))

# Subsequent calls no longer carry the startup cost.
outputs = llm.generate(actual_prompts)
Solution for cleaning up async engine resources (e.g. after errors or interrupted runs):
# Make sure the engine's in-flight requests are cleaned up even when
# generation raises.
import asyncio
from vllm import AsyncLLMEngine

async def main():
    engine = AsyncLLMEngine.from_engine_args(engine_args)
    try:
        return await generate_async(prompt)
    finally:
        # NOTE(review): abort_all_requests() is not a method I can confirm
        # on AsyncLLMEngine — verify against the installed vLLM version;
        # engine.abort(request_id) is the documented per-request form.
        await engine.abort_all_requests()

asyncio.run(main())