services:
  vllm:
    container_name: vllm
    ulimits:
      stack: 67108864
      memlock: -1
    restart: always
    image: vllm/vllm-openai
    ipc: host
    volumes:
      - /mnt/models/:/models
    command: [
              "--model", "/models/deepseek-70b",
              "--served-model-name", "DeepSeek-R1-70B",
              "--gpu-memory-utilization", "0.9",
              "--max-num-batched-tokens", "32768",
              "--max-num-seqs", "256",
              "--tensor-parallel-size", "8",
              "--max-model-len", "32768",
              "--enable-reasoning","--reasoning-parser","deepseek_r1"]
    ports:
      - "127.0.0.1:8000:8000"  # 关键修改点
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      - VLLM_DISABLE_TELEMETRY=1
      - NCCL_P2P_DISABLE=1
      - NCCL_P2P_LEVEL=NVL
      - NCCL_ALGO=Ring
      - NCCL_MIN_NCHANNELS=16
      - NCCL_DEBUG=WARN
      - VLLM_MAX_NUM_TOKENS=100000
      - CUDA_LAUNCH_BLOCKING=0
      - CUBLAS_WORKSPACE_CONFIG=:4096:8
  bge_embedding:
    container_name: bge_embedding
    image: ghcr.io/huggingface/text-embeddings-inference:latest
    restart: unless-stopped
    volumes:
      - /mnt/models/bge-large-zh-v1.5:/models/bge-large-zh-v1.5  # 确保模型路径正确
    command: [
      "--model-id", "/models/bge-large-zh-v1.5",
      "--port=8001",
      "--revision=main",
      "--pooling=cls",
      "--auto-truncate",          # 自动截断超长文本
      "--max-client-batch-size=64",
      "--max-batch-tokens=65536"
    ]
    ports:
      - "127.0.0.1:8001:8001"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1  # 根据需求调整GPU数量
              capabilities: [gpu]
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - HF_HUB_ENABLE_HF_TRANSFER=1  

# Logo
#
# 这里是“一人公司”的成长家园。我们提供从产品曝光、技术变现到法律财税的全栈内容,并连接云服务、办公空间等稀缺资源,助你专注创造,无忧运营。
#
# 更多推荐