Throughput Test Results - InferenceLatency.com

Full JSON Response (for developers & agents)
{
  "service": "InferenceLatency.com",
  "endpoint": "throughput",
  "description": "Combined latency and throughput benchmarking",
  "providers": [
    {
      "provider": "OpenAI",
      "model": "GPT-4o",
      "metrics": {
        "latency_ms": 788,
        "throughput_tokens_per_sec": 22.85
      },
      "tokens_generated": 18,
      "elapsed_seconds": 0.788,
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 100.0,
        "error_rate_percent": 0.0,
        "status": "healthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.0025,
        "estimated_cost_usd": 4.5e-05
      },
      "model_metadata": {
        "release_date": "2024-05-13",
        "context_length": 128000,
        "hardware": "GPU",
        "api_type": "OpenAI-compatible"
      },
      "infra": {
        "schema_url": "https://platform.openai.com/docs/api-reference",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://platform.openai.com/docs",
        "api_docs": "https://platform.openai.com/docs/api-reference/chat"
      },
      "test_metadata": {
        "request_hash": "ed44a67c6e81",
        "trace_id": "trace-ab8e0020"
      },
      "history": {
        "latency_trend_7d": [
          752,
          750,
          748,
          749,
          751,
          750,
          788
        ],
        "throughput_trend_7d": [
          31.9,
          32.1,
          32.0,
          32.2,
          32.0,
          32.0,
          22.85
        ]
      }
    },
    {
      "provider": "Groq",
      "model": "Llama3-8B",
      "metrics": {
        "latency_ms": null,
        "throughput_tokens_per_sec": null
      },
      "error": "Client error '400 Bad Request' for url 'https://api.groq.com/openai/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400",
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 0.0,
        "error_rate_percent": 100.0,
        "status": "unhealthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.00027,
        "estimated_cost_usd": 0.0
      },
      "model_metadata": {
        "release_date": "2024-04-18",
        "context_length": 8192,
        "hardware": "LPU",
        "api_type": "OpenAI-compatible",
        "schema_url": "https://console.groq.com/docs/openai",
        "human_readable_url": "https://console.groq.com/docs",
        "api_docs": "https://console.groq.com/docs/api-reference"
      },
      "infra": {
        "schema_url": "https://console.groq.com/docs/openai",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://console.groq.com/docs",
        "api_docs": "https://console.groq.com/docs/api-reference"
      },
      "test_metadata": {
        "request_hash": "f9bc60cedb12",
        "trace_id": "trace-be0af5c6"
      },
      "history": {
        "latency_trend_7d": [
          952,
          950,
          948,
          949,
          951,
          950,
          950
        ],
        "throughput_trend_7d": [
          121.9,
          122.1,
          122.0,
          122.2,
          122.0,
          122.0,
          122.01
        ]
      }
    },
    {
      "provider": "Claude",
      "model": "Claude Sonnet 4",
      "metrics": {
        "latency_ms": null,
        "throughput_tokens_per_sec": null
      },
      "error": "Error code: 404 - {'type': 'error', 'error': {'type': 'not_found_error', 'message': 'model: claude-sonnet-4-20250514'}, 'request_id': 'req_011Cd3DUifrDJgAf5BixcRS4'}",
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 0.0,
        "error_rate_percent": 100.0,
        "status": "unhealthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.003,
        "estimated_cost_usd": 0.0
      },
      "model_metadata": {
        "release_date": "2025-05-22",
        "context_length": 200000,
        "hardware": "GPU",
        "api_type": "Anthropic-native",
        "schema_url": "https://docs.anthropic.com/en/api/messages",
        "human_readable_url": "https://docs.anthropic.com",
        "api_docs": "https://docs.anthropic.com/en/api"
      },
      "infra": {
        "schema_url": "https://docs.anthropic.com/en/api/messages",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://docs.anthropic.com",
        "api_docs": "https://docs.anthropic.com/en/api"
      },
      "test_metadata": {
        "request_hash": "c4fb7f97d38a",
        "trace_id": "trace-e0b94c17"
      },
      "history": {
        "latency_trend_7d": [
          1152,
          1150,
          1148,
          1149,
          1151,
          1150,
          1150
        ],
        "throughput_trend_7d": [
          21.9,
          22.1,
          22.0,
          22.2,
          22.0,
          22.0,
          22.01
        ]
      }
    },
    {
      "provider": "OpenRouter",
      "model": "Mistral",
      "metrics": {
        "latency_ms": null,
        "throughput_tokens_per_sec": null
      },
      "error": "Client error '404 Not Found' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404",
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 0.0,
        "error_rate_percent": 100.0,
        "status": "unhealthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.00018,
        "estimated_cost_usd": 0.0
      },
      "model_metadata": {
        "release_date": "2024-02-26",
        "context_length": 32768,
        "hardware": "GPU",
        "api_type": "OpenAI-compatible",
        "schema_url": "https://openrouter.ai/docs",
        "human_readable_url": "https://openrouter.ai/docs",
        "api_docs": "https://openrouter.ai/docs/api"
      },
      "infra": {
        "schema_url": "https://openrouter.ai/docs",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://openrouter.ai/docs",
        "api_docs": "https://openrouter.ai/docs/api"
      },
      "test_metadata": {
        "request_hash": "b4c0e75d5852",
        "trace_id": "trace-f954eb7a"
      },
      "history": {
        "latency_trend_7d": [
          652,
          650,
          648,
          649,
          651,
          650,
          650
        ],
        "throughput_trend_7d": [
          41.9,
          42.1,
          42.0,
          42.2,
          42.0,
          42.0,
          42.01
        ]
      }
    },
    {
      "provider": "Google Gemini",
      "model": "Gemini-2.0-Flash",
      "metrics": {
        "latency_ms": null,
        "throughput_tokens_per_sec": null
      },
      "error": "404 NOT_FOUND. {'error': {'code': 404, 'message': 'models/gemini-2.0-flash-exp is not found for API version v1beta, or is not supported for generateContent. Call ModelService.ListModels to see the list of available models and their supported methods.', 'status': 'NOT_FOUND'}}",
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 0.0,
        "error_rate_percent": 100.0,
        "status": "unhealthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.00075,
        "estimated_cost_usd": 0.0
      },
      "model_metadata": {
        "release_date": "2024-12-11",
        "context_length": 2000000,
        "hardware": "TPU",
        "api_type": "Google-native",
        "schema_url": "https://ai.google.dev/api",
        "human_readable_url": "https://ai.google.dev/docs",
        "api_docs": "https://ai.google.dev/api/generate-content"
      },
      "infra": {
        "schema_url": "https://ai.google.dev/api",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://ai.google.dev/docs",
        "api_docs": "https://ai.google.dev/api/generate-content"
      },
      "test_metadata": {
        "request_hash": "a4d9dfa3ca1b",
        "trace_id": "trace-f478e0eb"
      },
      "history": {
        "latency_trend_7d": [
          352,
          350,
          348,
          349,
          351,
          350,
          350
        ],
        "throughput_trend_7d": [
          91.9,
          92.1,
          92.0,
          92.2,
          92.0,
          92.0,
          92.01
        ]
      }
    },
    {
      "provider": "Together AI",
      "model": "Llama3.1-8B-Turbo",
      "metrics": {
        "latency_ms": null,
        "throughput_tokens_per_sec": null
      },
      "error": "Client error '400 Bad Request' for url 'https://api.together.xyz/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400",
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 0.0,
        "error_rate_percent": 100.0,
        "status": "unhealthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.0002,
        "estimated_cost_usd": 0.0
      },
      "model_metadata": {
        "release_date": "2024-01-15",
        "context_length": 32768,
        "hardware": "GPU",
        "api_type": "OpenAI-compatible",
        "schema_url": "https://docs.together.ai/docs/inference-models",
        "human_readable_url": "https://docs.together.ai",
        "api_docs": "https://docs.together.ai/reference/chat-completions"
      },
      "infra": {
        "schema_url": "https://docs.together.ai/docs/inference-models",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://docs.together.ai",
        "api_docs": "https://docs.together.ai/reference/chat-completions"
      },
      "test_metadata": {
        "request_hash": "f35c4029e943",
        "trace_id": "trace-b917c107"
      },
      "history": {
        "latency_trend_7d": [
          120,
          118,
          116,
          117,
          119,
          118,
          118
        ],
        "throughput_trend_7d": [
          145.9,
          146.1,
          146.0,
          146.2,
          146.0,
          146.0,
          146.01
        ]
      }
    },
    {
      "provider": "Fireworks AI",
      "model": "Llama3.1-8B",
      "metrics": {
        "latency_ms": null,
        "throughput_tokens_per_sec": null
      },
      "error": "Client error '404 Not Found' for url 'https://api.fireworks.ai/inference/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404",
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 0.0,
        "error_rate_percent": 100.0,
        "status": "unhealthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.0002,
        "estimated_cost_usd": 0.0
      },
      "model_metadata": {
        "release_date": "2024-04-15",
        "context_length": 8192,
        "hardware": "GPU",
        "api_type": "OpenAI-compatible",
        "schema_url": "https://readme.fireworks.ai/reference/createchatcompletion",
        "human_readable_url": "https://fireworks.ai/",
        "api_docs": "https://readme.fireworks.ai/docs"
      },
      "infra": {
        "schema_url": "https://readme.fireworks.ai/reference/createchatcompletion",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://fireworks.ai/",
        "api_docs": "https://readme.fireworks.ai/docs"
      },
      "test_metadata": {
        "request_hash": "ff79059068df",
        "trace_id": "trace-6b81e33e"
      },
      "history": {
        "latency_trend_7d": [
          180,
          178,
          176,
          177,
          179,
          178,
          178
        ],
        "throughput_trend_7d": [
          135.9,
          136.1,
          136.0,
          136.2,
          136.0,
          136.0,
          136.01
        ]
      }
    },
    {
      "provider": "HF GPT OSS 120B (Cerebras)",
      "model": "GPT OSS 120B",
      "metrics": {
        "latency_ms": null,
        "throughput_tokens_per_sec": null
      },
      "error": "Client error '401 Unauthorized' for url 'https://router.huggingface.co/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/401",
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 0.0,
        "error_rate_percent": 100.0,
        "status": "unhealthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.0008,
        "estimated_cost_usd": 0.0
      },
      "model_metadata": {
        "release_date": "2025-01-07",
        "context_length": 128000,
        "hardware": "Cerebras WSE",
        "api_type": "OpenAI-compatible",
        "schema_url": "https://huggingface.co/docs/inference-providers",
        "human_readable_url": "https://huggingface.co/openai/gpt-oss-120b",
        "api_docs": "https://huggingface.co/docs/inference-providers/en/guides/gpt-oss"
      }
    },
    {
      "provider": "DeepSeek",
      "model": "deepseek-chat",
      "metrics": {
        "latency_ms": 956,
        "throughput_tokens_per_sec": 14.65
      },
      "tokens_generated": 14,
      "elapsed_seconds": 0.956,
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 100.0,
        "error_rate_percent": 0.0,
        "status": "healthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.00027,
        "estimated_cost_usd": 4e-06
      },
      "model_metadata": {
        "release_date": "2024-12-01",
        "context_length": 64000,
        "hardware": "GPU",
        "api_type": "OpenAI-compatible",
        "schema_url": "https://api-docs.deepseek.com/",
        "human_readable_url": "https://platform.deepseek.com/",
        "api_docs": "https://api-docs.deepseek.com/api/create-chat-completion"
      },
      "infra": {
        "schema_url": "https://api-docs.deepseek.com/",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://platform.deepseek.com/",
        "api_docs": "https://api-docs.deepseek.com/api/create-chat-completion"
      },
      "test_metadata": {
        "request_hash": "b3705450137a",
        "trace_id": "trace-f1e7a08c"
      },
      "history": {
        "latency_trend_7d": [
          280,
          278,
          276,
          277,
          279,
          278,
          278
        ],
        "throughput_trend_7d": [
          85.9,
          86.1,
          86.0,
          86.2,
          86.0,
          86.0,
          86.01
        ]
      }
    },
    {
      "provider": "Cohere",
      "model": "command-r",
      "metrics": {
        "latency_ms": null,
        "throughput_tokens_per_sec": null
      },
      "error": "Client error '429 Too Many Requests' for url 'https://api.cohere.ai/v1/chat'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429",
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 0.0,
        "error_rate_percent": 100.0,
        "status": "unhealthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.00075,
        "estimated_cost_usd": 0.0
      },
      "model_metadata": {
        "release_date": "2024-10-01",
        "context_length": 128000,
        "hardware": "GPU",
        "api_type": "Cohere-native",
        "schema_url": "https://docs.cohere.com/reference/chat",
        "human_readable_url": "https://docs.cohere.com/",
        "api_docs": "https://docs.cohere.com/reference/chat"
      },
      "infra": {
        "schema_url": "https://docs.cohere.com/reference/chat",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://docs.cohere.com/",
        "api_docs": "https://docs.cohere.com/reference/chat"
      },
      "test_metadata": {
        "request_hash": "66ba90393b5f",
        "trace_id": "trace-ce54325a"
      },
      "history": {
        "latency_trend_7d": [
          320,
          318,
          316,
          317,
          319,
          318,
          318
        ],
        "throughput_trend_7d": [
          45.9,
          46.1,
          46.0,
          46.2,
          46.0,
          46.0,
          46.01
        ]
      }
    }
  ],
  "rankings": {
    "by_latency": [
      {
        "provider": "OpenAI",
        "model": "GPT-4o",
        "metrics": {
          "latency_ms": 788,
          "throughput_tokens_per_sec": 22.85
        },
        "tokens_generated": 18,
        "elapsed_seconds": 0.788,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.0025,
          "estimated_cost_usd": 4.5e-05
        },
        "model_metadata": {
          "release_date": "2024-05-13",
          "context_length": 128000,
          "hardware": "GPU",
          "api_type": "OpenAI-compatible"
        },
        "infra": {
          "schema_url": "https://platform.openai.com/docs/api-reference",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://platform.openai.com/docs",
          "api_docs": "https://platform.openai.com/docs/api-reference/chat"
        },
        "test_metadata": {
          "request_hash": "ed44a67c6e81",
          "trace_id": "trace-ab8e0020"
        },
        "history": {
          "latency_trend_7d": [
            752,
            750,
            748,
            749,
            751,
            750,
            788
          ],
          "throughput_trend_7d": [
            31.9,
            32.1,
            32.0,
            32.2,
            32.0,
            32.0,
            22.85
          ]
        }
      },
      {
        "provider": "DeepSeek",
        "model": "deepseek-chat",
        "metrics": {
          "latency_ms": 956,
          "throughput_tokens_per_sec": 14.65
        },
        "tokens_generated": 14,
        "elapsed_seconds": 0.956,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.00027,
          "estimated_cost_usd": 4e-06
        },
        "model_metadata": {
          "release_date": "2024-12-01",
          "context_length": 64000,
          "hardware": "GPU",
          "api_type": "OpenAI-compatible",
          "schema_url": "https://api-docs.deepseek.com/",
          "human_readable_url": "https://platform.deepseek.com/",
          "api_docs": "https://api-docs.deepseek.com/api/create-chat-completion"
        },
        "infra": {
          "schema_url": "https://api-docs.deepseek.com/",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://platform.deepseek.com/",
          "api_docs": "https://api-docs.deepseek.com/api/create-chat-completion"
        },
        "test_metadata": {
          "request_hash": "b3705450137a",
          "trace_id": "trace-f1e7a08c"
        },
        "history": {
          "latency_trend_7d": [
            280,
            278,
            276,
            277,
            279,
            278,
            278
          ],
          "throughput_trend_7d": [
            85.9,
            86.1,
            86.0,
            86.2,
            86.0,
            86.0,
            86.01
          ]
        }
      }
    ],
    "by_throughput": [
      {
        "provider": "OpenAI",
        "model": "GPT-4o",
        "metrics": {
          "latency_ms": 788,
          "throughput_tokens_per_sec": 22.85
        },
        "tokens_generated": 18,
        "elapsed_seconds": 0.788,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.0025,
          "estimated_cost_usd": 4.5e-05
        },
        "model_metadata": {
          "release_date": "2024-05-13",
          "context_length": 128000,
          "hardware": "GPU",
          "api_type": "OpenAI-compatible"
        },
        "infra": {
          "schema_url": "https://platform.openai.com/docs/api-reference",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://platform.openai.com/docs",
          "api_docs": "https://platform.openai.com/docs/api-reference/chat"
        },
        "test_metadata": {
          "request_hash": "ed44a67c6e81",
          "trace_id": "trace-ab8e0020"
        },
        "history": {
          "latency_trend_7d": [
            752,
            750,
            748,
            749,
            751,
            750,
            788
          ],
          "throughput_trend_7d": [
            31.9,
            32.1,
            32.0,
            32.2,
            32.0,
            32.0,
            22.85
          ]
        }
      },
      {
        "provider": "DeepSeek",
        "model": "deepseek-chat",
        "metrics": {
          "latency_ms": 956,
          "throughput_tokens_per_sec": 14.65
        },
        "tokens_generated": 14,
        "elapsed_seconds": 0.956,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.00027,
          "estimated_cost_usd": 4e-06
        },
        "model_metadata": {
          "release_date": "2024-12-01",
          "context_length": 64000,
          "hardware": "GPU",
          "api_type": "OpenAI-compatible",
          "schema_url": "https://api-docs.deepseek.com/",
          "human_readable_url": "https://platform.deepseek.com/",
          "api_docs": "https://api-docs.deepseek.com/api/create-chat-completion"
        },
        "infra": {
          "schema_url": "https://api-docs.deepseek.com/",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://platform.deepseek.com/",
          "api_docs": "https://api-docs.deepseek.com/api/create-chat-completion"
        },
        "test_metadata": {
          "request_hash": "b3705450137a",
          "trace_id": "trace-f1e7a08c"
        },
        "history": {
          "latency_trend_7d": [
            280,
            278,
            276,
            277,
            279,
            278,
            278
          ],
          "throughput_trend_7d": [
            85.9,
            86.1,
            86.0,
            86.2,
            86.0,
            86.0,
            86.01
          ]
        }
      }
    ]
  },
  "fastest_latency": "OpenAI",
  "highest_throughput": "OpenAI",
  "total_tested": 10,
  "successful_tests": 2,
  "failed_tests": 8,
  "performance_summary": {
    "best_latency_ms": 788,
    "best_throughput_tokens_per_sec": 22.85,
    "avg_latency_ms": 872,
    "avg_throughput_tokens_per_sec": 18.75
  },
  "ai_guidance": {
    "best_for_speed": "OpenAI",
    "best_for_throughput": "OpenAI",
    "recommendation": "Use OpenAI for both lowest latency and highest throughput",
    "use_case_guidance": {
      "real_time_chat": "Recommended: OpenAI (lowest latency)",
      "bulk_generation": "Recommended: OpenAI (highest throughput)",
      "balanced_workload": "Recommended: OpenAI (lowest latency and highest throughput)"
    }
  },
  "human_readable_summary": "\u26a1 Fastest: OpenAI (788ms - Moderate) | \ud83d\ude80 Highest throughput: OpenAI (22.9 tokens/sec - Moderate) | \u2705 2/10 providers responding",
  "timestamp": "2026-07-15T04:31:20.581349Z"
}
⚡ Throughput Analysis Results