⚡ Throughput Analysis Results

← Back to Home

⚡ Fastest: Together AI (361ms - Good) | 🚀 Highest throughput: HF GPT OSS 120B (Cerebras) (121.3 tokens/sec - High) | ✅ 7/8 providers responding
Full JSON Response (for developers & agents)
{
  "service": "InferenceLatency.com",
  "endpoint": "throughput",
  "description": "Combined latency and throughput benchmarking",
  "providers": [
    {
      "provider": "OpenAI",
      "model": "GPT-4o",
      "metrics": {
        "latency_ms": 1095,
        "throughput_tokens_per_sec": 15.53
      },
      "tokens_generated": 17,
      "elapsed_seconds": 1.095,
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 100.0,
        "error_rate_percent": 0.0,
        "status": "healthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.0025,
        "estimated_cost_usd": 4.3e-05
      },
      "model_metadata": {
        "release_date": "2024-05-13",
        "context_length": 128000,
        "hardware": "GPU",
        "api_type": "OpenAI-compatible"
      },
      "infra": {
        "schema_url": "https://platform.openai.com/docs/api-reference",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://platform.openai.com/docs",
        "api_docs": "https://platform.openai.com/docs/api-reference/chat"
      },
      "test_metadata": {
        "request_hash": "dc2b6d757486",
        "trace_id": "trace-e1a9b6b4"
      },
      "history": {
        "latency_trend_7d": [
          752,
          750,
          748,
          749,
          751,
          750,
          1095
        ],
        "throughput_trend_7d": [
          31.9,
          32.1,
          32.0,
          32.2,
          32.0,
          32.0,
          15.53
        ]
      }
    },
    {
      "provider": "Groq",
      "model": "Llama3-8B",
      "metrics": {
        "latency_ms": null,
        "throughput_tokens_per_sec": null
      },
      "error": "Client error '400 Bad Request' for url 'https://api.groq.com/openai/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400",
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 0.0,
        "error_rate_percent": 100.0,
        "status": "unhealthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.00027,
        "estimated_cost_usd": 0.0
      },
      "model_metadata": {
        "release_date": "2024-04-18",
        "context_length": 8192,
        "hardware": "LPU",
        "api_type": "OpenAI-compatible",
        "schema_url": "https://console.groq.com/docs/openai",
        "human_readable_url": "https://console.groq.com/docs",
        "api_docs": "https://console.groq.com/docs/api-reference"
      },
      "infra": {
        "schema_url": "https://console.groq.com/docs/openai",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://console.groq.com/docs",
        "api_docs": "https://console.groq.com/docs/api-reference"
      },
      "test_metadata": {
        "request_hash": "da1ed9b456ed",
        "trace_id": "trace-7f20559c"
      },
      "history": {
        "latency_trend_7d": [
          952,
          950,
          948,
          949,
          951,
          950,
          950
        ],
        "throughput_trend_7d": [
          121.9,
          122.1,
          122.0,
          122.2,
          122.0,
          122.0,
          122.01
        ]
      }
    },
    {
      "provider": "Claude",
      "model": "Claude Sonnet 4",
      "metrics": {
        "latency_ms": 2471,
        "throughput_tokens_per_sec": 12.55
      },
      "tokens_generated": 31,
      "elapsed_seconds": 2.471,
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 100.0,
        "error_rate_percent": 0.0,
        "status": "healthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.003,
        "estimated_cost_usd": 9.3e-05
      },
      "model_metadata": {
        "release_date": "2024-10-22",
        "context_length": 200000,
        "hardware": "GPU",
        "api_type": "Anthropic-native"
      },
      "infra": {
        "schema_url": "https://docs.anthropic.com/en/api/messages",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://docs.anthropic.com",
        "api_docs": "https://docs.anthropic.com/en/api"
      },
      "test_metadata": {
        "request_hash": "bb1a07893be0",
        "trace_id": "trace-b646eda2"
      },
      "history": {
        "latency_trend_7d": [
          1152,
          1150,
          1148,
          1149,
          1151,
          1150,
          2471
        ],
        "throughput_trend_7d": [
          21.9,
          22.1,
          22.0,
          22.2,
          22.0,
          22.0,
          12.55
        ]
      }
    },
    {
      "provider": "OpenRouter",
      "model": "Mistral",
      "metrics": {
        "latency_ms": 1401,
        "throughput_tokens_per_sec": 16.42
      },
      "tokens_generated": 23,
      "elapsed_seconds": 1.401,
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 100.0,
        "error_rate_percent": 0.0,
        "status": "healthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.00018,
        "estimated_cost_usd": 4e-06
      },
      "model_metadata": {
        "release_date": "2024-02-26",
        "context_length": 32768,
        "hardware": "GPU",
        "api_type": "OpenAI-compatible"
      },
      "infra": {
        "schema_url": "https://openrouter.ai/docs",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://openrouter.ai/docs",
        "api_docs": "https://openrouter.ai/docs/api"
      },
      "test_metadata": {
        "request_hash": "6e0c7b85890a",
        "trace_id": "trace-f5d01234"
      },
      "history": {
        "latency_trend_7d": [
          652,
          650,
          648,
          649,
          651,
          650,
          1401
        ],
        "throughput_trend_7d": [
          41.9,
          42.1,
          42.0,
          42.2,
          42.0,
          42.0,
          16.42
        ]
      }
    },
    {
      "provider": "Google Gemini",
      "model": "Gemini-2.0-Flash",
      "metrics": {
        "latency_ms": 583,
        "throughput_tokens_per_sec": 18.88
      },
      "tokens_generated": 11,
      "elapsed_seconds": 0.583,
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 100.0,
        "error_rate_percent": 0.0,
        "status": "healthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.00075,
        "estimated_cost_usd": 8e-06
      },
      "model_metadata": {
        "release_date": "2024-12-11",
        "context_length": 2000000,
        "hardware": "TPU",
        "api_type": "Google-native"
      },
      "infra": {
        "schema_url": "https://ai.google.dev/api",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://ai.google.dev/docs",
        "api_docs": "https://ai.google.dev/api/generate-content"
      },
      "test_metadata": {
        "request_hash": "10d1eaa1ac9d",
        "trace_id": "trace-1ecf93d9"
      },
      "history": {
        "latency_trend_7d": [
          352,
          350,
          348,
          349,
          351,
          350,
          583
        ],
        "throughput_trend_7d": [
          91.9,
          92.1,
          92.0,
          92.2,
          92.0,
          92.0,
          18.88
        ]
      }
    },
    {
      "provider": "Together AI",
      "model": "Llama3.1-8B-Turbo",
      "metrics": {
        "latency_ms": 361,
        "throughput_tokens_per_sec": 88.65
      },
      "tokens_generated": 32,
      "elapsed_seconds": 0.361,
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 100.0,
        "error_rate_percent": 0.0,
        "status": "healthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.0002,
        "estimated_cost_usd": 6e-06
      },
      "model_metadata": {
        "release_date": "2024-01-15",
        "context_length": 32768,
        "hardware": "GPU",
        "api_type": "OpenAI-compatible"
      },
      "infra": {
        "schema_url": "https://docs.together.ai/docs/inference-models",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://docs.together.ai",
        "api_docs": "https://docs.together.ai/reference/chat-completions"
      },
      "test_metadata": {
        "request_hash": "c35de0059f8b",
        "trace_id": "trace-4fa3ad56"
      },
      "history": {
        "latency_trend_7d": [
          120,
          118,
          116,
          117,
          119,
          118,
          361
        ],
        "throughput_trend_7d": [
          145.9,
          146.1,
          146.0,
          146.2,
          146.0,
          146.0,
          88.65
        ]
      }
    },
    {
      "provider": "Fireworks AI",
      "model": "Llama3.1-8B",
      "metrics": {
        "latency_ms": 440,
        "throughput_tokens_per_sec": 72.72
      },
      "tokens_generated": 32,
      "elapsed_seconds": 0.44,
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 100.0,
        "error_rate_percent": 0.0,
        "status": "healthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.0002,
        "estimated_cost_usd": 6e-06
      },
      "model_metadata": {
        "release_date": "2024-04-15",
        "context_length": 8192,
        "hardware": "GPU",
        "api_type": "OpenAI-compatible"
      },
      "infra": {
        "schema_url": "https://readme.fireworks.ai/reference/createchatcompletion",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://fireworks.ai/",
        "api_docs": "https://readme.fireworks.ai/docs"
      },
      "test_metadata": {
        "request_hash": "f8bbfc248c7e",
        "trace_id": "trace-5a6f10ee"
      },
      "history": {
        "latency_trend_7d": [
          180,
          178,
          176,
          177,
          179,
          178,
          440
        ],
        "throughput_trend_7d": [
          135.9,
          136.1,
          136.0,
          136.2,
          136.0,
          136.0,
          72.72
        ]
      }
    },
    {
      "provider": "HF GPT OSS 120B (Cerebras)",
      "model": "GPT OSS 120B",
      "metrics": {
        "latency_ms": 412,
        "throughput_tokens_per_sec": 121.32
      },
      "tokens_generated": 50,
      "elapsed_seconds": 0.412,
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 100.0,
        "error_rate_percent": 0.0,
        "status": "healthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.0008,
        "estimated_cost_usd": 4e-05
      },
      "model_metadata": {
        "release_date": "2025-01-07",
        "context_length": 128000,
        "hardware": "Cerebras WSE",
        "api_type": "OpenAI-compatible"
      }
    }
  ],
  "rankings": {
    "by_latency": [
      {
        "provider": "Together AI",
        "model": "Llama3.1-8B-Turbo",
        "metrics": {
          "latency_ms": 361,
          "throughput_tokens_per_sec": 88.65
        },
        "tokens_generated": 32,
        "elapsed_seconds": 0.361,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.0002,
          "estimated_cost_usd": 6e-06
        },
        "model_metadata": {
          "release_date": "2024-01-15",
          "context_length": 32768,
          "hardware": "GPU",
          "api_type": "OpenAI-compatible"
        },
        "infra": {
          "schema_url": "https://docs.together.ai/docs/inference-models",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://docs.together.ai",
          "api_docs": "https://docs.together.ai/reference/chat-completions"
        },
        "test_metadata": {
          "request_hash": "c35de0059f8b",
          "trace_id": "trace-4fa3ad56"
        },
        "history": {
          "latency_trend_7d": [
            120,
            118,
            116,
            117,
            119,
            118,
            361
          ],
          "throughput_trend_7d": [
            145.9,
            146.1,
            146.0,
            146.2,
            146.0,
            146.0,
            88.65
          ]
        }
      },
      {
        "provider": "HF GPT OSS 120B (Cerebras)",
        "model": "GPT OSS 120B",
        "metrics": {
          "latency_ms": 412,
          "throughput_tokens_per_sec": 121.32
        },
        "tokens_generated": 50,
        "elapsed_seconds": 0.412,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.0008,
          "estimated_cost_usd": 4e-05
        },
        "model_metadata": {
          "release_date": "2025-01-07",
          "context_length": 128000,
          "hardware": "Cerebras WSE",
          "api_type": "OpenAI-compatible"
        }
      },
      {
        "provider": "Fireworks AI",
        "model": "Llama3.1-8B",
        "metrics": {
          "latency_ms": 440,
          "throughput_tokens_per_sec": 72.72
        },
        "tokens_generated": 32,
        "elapsed_seconds": 0.44,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.0002,
          "estimated_cost_usd": 6e-06
        },
        "model_metadata": {
          "release_date": "2024-04-15",
          "context_length": 8192,
          "hardware": "GPU",
          "api_type": "OpenAI-compatible"
        },
        "infra": {
          "schema_url": "https://readme.fireworks.ai/reference/createchatcompletion",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://fireworks.ai/",
          "api_docs": "https://readme.fireworks.ai/docs"
        },
        "test_metadata": {
          "request_hash": "f8bbfc248c7e",
          "trace_id": "trace-5a6f10ee"
        },
        "history": {
          "latency_trend_7d": [
            180,
            178,
            176,
            177,
            179,
            178,
            440
          ],
          "throughput_trend_7d": [
            135.9,
            136.1,
            136.0,
            136.2,
            136.0,
            136.0,
            72.72
          ]
        }
      },
      {
        "provider": "Google Gemini",
        "model": "Gemini-2.0-Flash",
        "metrics": {
          "latency_ms": 583,
          "throughput_tokens_per_sec": 18.88
        },
        "tokens_generated": 11,
        "elapsed_seconds": 0.583,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.00075,
          "estimated_cost_usd": 8e-06
        },
        "model_metadata": {
          "release_date": "2024-12-11",
          "context_length": 2000000,
          "hardware": "TPU",
          "api_type": "Google-native"
        },
        "infra": {
          "schema_url": "https://ai.google.dev/api",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://ai.google.dev/docs",
          "api_docs": "https://ai.google.dev/api/generate-content"
        },
        "test_metadata": {
          "request_hash": "10d1eaa1ac9d",
          "trace_id": "trace-1ecf93d9"
        },
        "history": {
          "latency_trend_7d": [
            352,
            350,
            348,
            349,
            351,
            350,
            583
          ],
          "throughput_trend_7d": [
            91.9,
            92.1,
            92.0,
            92.2,
            92.0,
            92.0,
            18.88
          ]
        }
      },
      {
        "provider": "OpenAI",
        "model": "GPT-4o",
        "metrics": {
          "latency_ms": 1095,
          "throughput_tokens_per_sec": 15.53
        },
        "tokens_generated": 17,
        "elapsed_seconds": 1.095,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.0025,
          "estimated_cost_usd": 4.3e-05
        },
        "model_metadata": {
          "release_date": "2024-05-13",
          "context_length": 128000,
          "hardware": "GPU",
          "api_type": "OpenAI-compatible"
        },
        "infra": {
          "schema_url": "https://platform.openai.com/docs/api-reference",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://platform.openai.com/docs",
          "api_docs": "https://platform.openai.com/docs/api-reference/chat"
        },
        "test_metadata": {
          "request_hash": "dc2b6d757486",
          "trace_id": "trace-e1a9b6b4"
        },
        "history": {
          "latency_trend_7d": [
            752,
            750,
            748,
            749,
            751,
            750,
            1095
          ],
          "throughput_trend_7d": [
            31.9,
            32.1,
            32.0,
            32.2,
            32.0,
            32.0,
            15.53
          ]
        }
      },
      {
        "provider": "OpenRouter",
        "model": "Mistral",
        "metrics": {
          "latency_ms": 1401,
          "throughput_tokens_per_sec": 16.42
        },
        "tokens_generated": 23,
        "elapsed_seconds": 1.401,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.00018,
          "estimated_cost_usd": 4e-06
        },
        "model_metadata": {
          "release_date": "2024-02-26",
          "context_length": 32768,
          "hardware": "GPU",
          "api_type": "OpenAI-compatible"
        },
        "infra": {
          "schema_url": "https://openrouter.ai/docs",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://openrouter.ai/docs",
          "api_docs": "https://openrouter.ai/docs/api"
        },
        "test_metadata": {
          "request_hash": "6e0c7b85890a",
          "trace_id": "trace-f5d01234"
        },
        "history": {
          "latency_trend_7d": [
            652,
            650,
            648,
            649,
            651,
            650,
            1401
          ],
          "throughput_trend_7d": [
            41.9,
            42.1,
            42.0,
            42.2,
            42.0,
            42.0,
            16.42
          ]
        }
      },
      {
        "provider": "Claude",
        "model": "Claude Sonnet 4",
        "metrics": {
          "latency_ms": 2471,
          "throughput_tokens_per_sec": 12.55
        },
        "tokens_generated": 31,
        "elapsed_seconds": 2.471,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.003,
          "estimated_cost_usd": 9.3e-05
        },
        "model_metadata": {
          "release_date": "2024-10-22",
          "context_length": 200000,
          "hardware": "GPU",
          "api_type": "Anthropic-native"
        },
        "infra": {
          "schema_url": "https://docs.anthropic.com/en/api/messages",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://docs.anthropic.com",
          "api_docs": "https://docs.anthropic.com/en/api"
        },
        "test_metadata": {
          "request_hash": "bb1a07893be0",
          "trace_id": "trace-b646eda2"
        },
        "history": {
          "latency_trend_7d": [
            1152,
            1150,
            1148,
            1149,
            1151,
            1150,
            2471
          ],
          "throughput_trend_7d": [
            21.9,
            22.1,
            22.0,
            22.2,
            22.0,
            22.0,
            12.55
          ]
        }
      }
    ],
    "by_throughput": [
      {
        "provider": "HF GPT OSS 120B (Cerebras)",
        "model": "GPT OSS 120B",
        "metrics": {
          "latency_ms": 412,
          "throughput_tokens_per_sec": 121.32
        },
        "tokens_generated": 50,
        "elapsed_seconds": 0.412,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.0008,
          "estimated_cost_usd": 4e-05
        },
        "model_metadata": {
          "release_date": "2025-01-07",
          "context_length": 128000,
          "hardware": "Cerebras WSE",
          "api_type": "OpenAI-compatible"
        }
      },
      {
        "provider": "Together AI",
        "model": "Llama3.1-8B-Turbo",
        "metrics": {
          "latency_ms": 361,
          "throughput_tokens_per_sec": 88.65
        },
        "tokens_generated": 32,
        "elapsed_seconds": 0.361,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.0002,
          "estimated_cost_usd": 6e-06
        },
        "model_metadata": {
          "release_date": "2024-01-15",
          "context_length": 32768,
          "hardware": "GPU",
          "api_type": "OpenAI-compatible"
        },
        "infra": {
          "schema_url": "https://docs.together.ai/docs/inference-models",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://docs.together.ai",
          "api_docs": "https://docs.together.ai/reference/chat-completions"
        },
        "test_metadata": {
          "request_hash": "c35de0059f8b",
          "trace_id": "trace-4fa3ad56"
        },
        "history": {
          "latency_trend_7d": [
            120,
            118,
            116,
            117,
            119,
            118,
            361
          ],
          "throughput_trend_7d": [
            145.9,
            146.1,
            146.0,
            146.2,
            146.0,
            146.0,
            88.65
          ]
        }
      },
      {
        "provider": "Fireworks AI",
        "model": "Llama3.1-8B",
        "metrics": {
          "latency_ms": 440,
          "throughput_tokens_per_sec": 72.72
        },
        "tokens_generated": 32,
        "elapsed_seconds": 0.44,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.0002,
          "estimated_cost_usd": 6e-06
        },
        "model_metadata": {
          "release_date": "2024-04-15",
          "context_length": 8192,
          "hardware": "GPU",
          "api_type": "OpenAI-compatible"
        },
        "infra": {
          "schema_url": "https://readme.fireworks.ai/reference/createchatcompletion",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://fireworks.ai/",
          "api_docs": "https://readme.fireworks.ai/docs"
        },
        "test_metadata": {
          "request_hash": "f8bbfc248c7e",
          "trace_id": "trace-5a6f10ee"
        },
        "history": {
          "latency_trend_7d": [
            180,
            178,
            176,
            177,
            179,
            178,
            440
          ],
          "throughput_trend_7d": [
            135.9,
            136.1,
            136.0,
            136.2,
            136.0,
            136.0,
            72.72
          ]
        }
      },
      {
        "provider": "Google Gemini",
        "model": "Gemini-2.0-Flash",
        "metrics": {
          "latency_ms": 583,
          "throughput_tokens_per_sec": 18.88
        },
        "tokens_generated": 11,
        "elapsed_seconds": 0.583,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.00075,
          "estimated_cost_usd": 8e-06
        },
        "model_metadata": {
          "release_date": "2024-12-11",
          "context_length": 2000000,
          "hardware": "TPU",
          "api_type": "Google-native"
        },
        "infra": {
          "schema_url": "https://ai.google.dev/api",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://ai.google.dev/docs",
          "api_docs": "https://ai.google.dev/api/generate-content"
        },
        "test_metadata": {
          "request_hash": "10d1eaa1ac9d",
          "trace_id": "trace-1ecf93d9"
        },
        "history": {
          "latency_trend_7d": [
            352,
            350,
            348,
            349,
            351,
            350,
            583
          ],
          "throughput_trend_7d": [
            91.9,
            92.1,
            92.0,
            92.2,
            92.0,
            92.0,
            18.88
          ]
        }
      },
      {
        "provider": "OpenRouter",
        "model": "Mistral",
        "metrics": {
          "latency_ms": 1401,
          "throughput_tokens_per_sec": 16.42
        },
        "tokens_generated": 23,
        "elapsed_seconds": 1.401,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.00018,
          "estimated_cost_usd": 4e-06
        },
        "model_metadata": {
          "release_date": "2024-02-26",
          "context_length": 32768,
          "hardware": "GPU",
          "api_type": "OpenAI-compatible"
        },
        "infra": {
          "schema_url": "https://openrouter.ai/docs",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://openrouter.ai/docs",
          "api_docs": "https://openrouter.ai/docs/api"
        },
        "test_metadata": {
          "request_hash": "6e0c7b85890a",
          "trace_id": "trace-f5d01234"
        },
        "history": {
          "latency_trend_7d": [
            652,
            650,
            648,
            649,
            651,
            650,
            1401
          ],
          "throughput_trend_7d": [
            41.9,
            42.1,
            42.0,
            42.2,
            42.0,
            42.0,
            16.42
          ]
        }
      },
      {
        "provider": "OpenAI",
        "model": "GPT-4o",
        "metrics": {
          "latency_ms": 1095,
          "throughput_tokens_per_sec": 15.53
        },
        "tokens_generated": 17,
        "elapsed_seconds": 1.095,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.0025,
          "estimated_cost_usd": 4.3e-05
        },
        "model_metadata": {
          "release_date": "2024-05-13",
          "context_length": 128000,
          "hardware": "GPU",
          "api_type": "OpenAI-compatible"
        },
        "infra": {
          "schema_url": "https://platform.openai.com/docs/api-reference",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://platform.openai.com/docs",
          "api_docs": "https://platform.openai.com/docs/api-reference/chat"
        },
        "test_metadata": {
          "request_hash": "dc2b6d757486",
          "trace_id": "trace-e1a9b6b4"
        },
        "history": {
          "latency_trend_7d": [
            752,
            750,
            748,
            749,
            751,
            750,
            1095
          ],
          "throughput_trend_7d": [
            31.9,
            32.1,
            32.0,
            32.2,
            32.0,
            32.0,
            15.53
          ]
        }
      },
      {
        "provider": "Claude",
        "model": "Claude Sonnet 4",
        "metrics": {
          "latency_ms": 2471,
          "throughput_tokens_per_sec": 12.55
        },
        "tokens_generated": 31,
        "elapsed_seconds": 2.471,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.003,
          "estimated_cost_usd": 9.3e-05
        },
        "model_metadata": {
          "release_date": "2024-10-22",
          "context_length": 200000,
          "hardware": "GPU",
          "api_type": "Anthropic-native"
        },
        "infra": {
          "schema_url": "https://docs.anthropic.com/en/api/messages",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://docs.anthropic.com",
          "api_docs": "https://docs.anthropic.com/en/api"
        },
        "test_metadata": {
          "request_hash": "bb1a07893be0",
          "trace_id": "trace-b646eda2"
        },
        "history": {
          "latency_trend_7d": [
            1152,
            1150,
            1148,
            1149,
            1151,
            1150,
            2471
          ],
          "throughput_trend_7d": [
            21.9,
            22.1,
            22.0,
            22.2,
            22.0,
            22.0,
            12.55
          ]
        }
      }
    ]
  },
  "fastest_latency": "Together AI",
  "highest_throughput": "HF GPT OSS 120B (Cerebras)",
  "total_tested": 8,
  "successful_tests": 7,
  "failed_tests": 1,
  "performance_summary": {
    "best_latency_ms": 361,
    "best_throughput_tokens_per_sec": 121.32,
    "avg_latency_ms": 966,
    "avg_throughput_tokens_per_sec": 49.44
  },
  "ai_guidance": {
    "best_for_speed": "Together AI",
    "best_for_throughput": "HF GPT OSS 120B (Cerebras)",
    "recommendation": "Use Together AI for lowest latency, HF GPT OSS 120B (Cerebras) for highest throughput",
    "use_case_guidance": {
      "real_time_chat": "Recommended: Together AI (lowest latency)",
      "bulk_generation": "Recommended: HF GPT OSS 120B (Cerebras) (highest throughput)",
      "balanced_workload": "Consider both Together AI and HF GPT OSS 120B (Cerebras)"
    }
  },
  "human_readable_summary": "\u26a1 Fastest: Together AI (361ms - Good) | \ud83d\ude80 Highest throughput: HF GPT OSS 120B (Cerebras) (121.3 tokens/sec - High) | \u2705 7/8 providers responding",
  "timestamp": "2025-09-07T23:20:18.229087Z"
}