# DeepSeek configuration file (Docker Compose)
services:
  # vLLM OpenAI-compatible API server. Serves the model found at
  # /models/deepseek-70b (host directory /mnt/models/) under the public
  # name "DeepSeek-R1-70B".
  vllm:
    container_name: vllm
    ulimits:
      stack: 67108864  # 64 MiB
      memlock: -1  # unlimited locked memory
    restart: always
    # NOTE(review): no tag is given, so this resolves to `latest`; consider
    # pinning a version so the command-line flags below stay valid.
    image: vllm/vllm-openai
    # Share the host IPC namespace with the container.
    # NOTE(review): presumably needed for shared memory between the
    # tensor-parallel workers - confirm against the vLLM Docker docs.
    ipc: host
    volumes:
      - /mnt/models/:/models
    command:
      - "--model"
      - "/models/deepseek-70b"
      - "--served-model-name"
      - "DeepSeek-R1-70B"
      - "--gpu-memory-utilization"
      - "0.9"
      - "--max-num-batched-tokens"
      - "32768"
      - "--max-num-seqs"
      - "256"
      - "--tensor-parallel-size"
      - "8"
      - "--max-model-len"
      - "32768"
      # NOTE(review): newer vLLM releases reportedly deprecate
      # `--enable-reasoning` in favour of `--reasoning-parser` alone; verify
      # against the vLLM version actually pulled by the unpinned image.
      - "--enable-reasoning"
      - "--reasoning-parser"
      - "deepseek_r1"
    ports:
      # Key change: publish on the loopback address only, so the API is
      # reachable from the host itself but not from other machines.
      - "127.0.0.1:8000:8000"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      # NOTE(review): could not confirm that vLLM reads this variable; the
      # documented opt-out switches are VLLM_NO_USAGE_STATS / DO_NOT_TRACK.
      - VLLM_DISABLE_TELEMETRY=1
      # NOTE(review): NCCL_P2P_DISABLE=1 turns peer-to-peer transport off
      # entirely, which appears to make NCCL_P2P_LEVEL=NVL below
      # ineffective. Keep only the one that is actually intended.
      - NCCL_P2P_DISABLE=1
      - NCCL_P2P_LEVEL=NVL
      - NCCL_ALGO=Ring
      - NCCL_MIN_NCHANNELS=16
      - NCCL_DEBUG=WARN
      # NOTE(review): not a vLLM setting I can find documented; token limits
      # are already set through the --max-* flags above. Confirm or remove.
      - VLLM_MAX_NUM_TOKENS=100000
      - CUDA_LAUNCH_BLOCKING=0
      - "CUBLAS_WORKSPACE_CONFIG=:4096:8"

  # Text Embeddings Inference server for the bge-large-zh-v1.5 model,
  # listening on port 8001.
  bge_embedding:
    container_name: bge_embedding
    # NOTE(review): `latest` is a moving tag; consider pinning a release.
    image: ghcr.io/huggingface/text-embeddings-inference:latest
    restart: unless-stopped
    volumes:
      # Make sure the host-side model path is correct.
      - /mnt/models/bge-large-zh-v1.5:/models/bge-large-zh-v1.5
    command:
      - "--model-id"
      - "/models/bge-large-zh-v1.5"
      - "--port=8001"
      - "--revision=main"
      - "--pooling=cls"
      # Automatically truncate over-long input text.
      - "--auto-truncate"
      - "--max-client-batch-size=64"
      - "--max-batch-tokens=65536"
    ports:
      # Loopback only, same as the vllm service.
      - "127.0.0.1:8001:8001"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # Adjust the number of GPUs as required.
              # NOTE(review): the vllm service reserves `count: all`, so this
              # GPU is presumably shared with vLLM - check memory headroom.
              count: 1
              capabilities: [gpu]
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - HF_HUB_ENABLE_HF_TRANSFER=1
# Web-page residue below, not part of the configuration:
#   "More recommendations"
#   "All comments (0)"