⚡ Throughput Analysis Results

⚡ Fastest: Together AI (340ms - Good) | 🚀 Highest throughput: Together AI (91.1 tokens/sec - High) | ✅ 5/10 providers responding
Full JSON Response (for developers & agents)
{
  "service": "InferenceLatency.com",
  "endpoint": "throughput",
  "description": "Combined latency and throughput benchmarking",
  "providers": [
    {
      "provider": "OpenAI",
      "model": "GPT-4o",
      "metrics": {
        "latency_ms": 1173,
        "throughput_tokens_per_sec": 14.49
      },
      "tokens_generated": 17,
      "elapsed_seconds": 1.173,
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 100.0,
        "error_rate_percent": 0.0,
        "status": "healthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.0025,
        "estimated_cost_usd": 4.3e-05
      },
      "model_metadata": {
        "release_date": "2024-05-13",
        "context_length": 128000,
        "hardware": "GPU",
        "api_type": "OpenAI-compatible"
      },
      "infra": {
        "schema_url": "https://platform.openai.com/docs/api-reference",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://platform.openai.com/docs",
        "api_docs": "https://platform.openai.com/docs/api-reference/chat"
      },
      "test_metadata": {
        "request_hash": "126c4dab7b56",
        "trace_id": "trace-4f8e2659"
      },
      "history": {
        "latency_trend_7d": [
          752,
          750,
          748,
          749,
          751,
          750,
          1173
        ],
        "throughput_trend_7d": [
          31.9,
          32.1,
          32.0,
          32.2,
          32.0,
          32.0,
          14.49
        ]
      }
    },
    {
      "provider": "Groq",
      "model": "Llama3-8B",
      "metrics": {
        "latency_ms": null,
        "throughput_tokens_per_sec": null
      },
      "error": "Client error '400 Bad Request' for url 'https://api.groq.com/openai/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400",
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 0.0,
        "error_rate_percent": 100.0,
        "status": "unhealthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.00027,
        "estimated_cost_usd": 0.0
      },
      "model_metadata": {
        "release_date": "2024-04-18",
        "context_length": 8192,
        "hardware": "LPU",
        "api_type": "OpenAI-compatible",
        "schema_url": "https://console.groq.com/docs/openai",
        "human_readable_url": "https://console.groq.com/docs",
        "api_docs": "https://console.groq.com/docs/api-reference"
      },
      "infra": {
        "schema_url": "https://console.groq.com/docs/openai",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://console.groq.com/docs",
        "api_docs": "https://console.groq.com/docs/api-reference"
      },
      "test_metadata": {
        "request_hash": "d739ea0c736f",
        "trace_id": "trace-c1ef7e3a"
      },
      "history": {
        "latency_trend_7d": [
          952,
          950,
          948,
          949,
          951,
          950,
          950
        ],
        "throughput_trend_7d": [
          121.9,
          122.1,
          122.0,
          122.2,
          122.0,
          122.0,
          122.01
        ]
      }
    },
    {
      "provider": "Claude",
      "model": "Claude Sonnet 4",
      "metrics": {
        "latency_ms": 2414,
        "throughput_tokens_per_sec": 12.84
      },
      "tokens_generated": 31,
      "elapsed_seconds": 2.414,
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 100.0,
        "error_rate_percent": 0.0,
        "status": "healthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.003,
        "estimated_cost_usd": 9.3e-05
      },
      "model_metadata": {
        "release_date": "2024-10-22",
        "context_length": 200000,
        "hardware": "GPU",
        "api_type": "Anthropic-native"
      },
      "infra": {
        "schema_url": "https://docs.anthropic.com/en/api/messages",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://docs.anthropic.com",
        "api_docs": "https://docs.anthropic.com/en/api"
      },
      "test_metadata": {
        "request_hash": "d480d15bd31b",
        "trace_id": "trace-7ebb82bf"
      },
      "history": {
        "latency_trend_7d": [
          1152,
          1150,
          1148,
          1149,
          1151,
          1150,
          2414
        ],
        "throughput_trend_7d": [
          21.9,
          22.1,
          22.0,
          22.2,
          22.0,
          22.0,
          12.84
        ]
      }
    },
    {
      "provider": "OpenRouter",
      "model": "Mistral",
      "metrics": {
        "latency_ms": 707,
        "throughput_tokens_per_sec": 29.69
      },
      "tokens_generated": 21,
      "elapsed_seconds": 0.707,
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 100.0,
        "error_rate_percent": 0.0,
        "status": "healthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.00018,
        "estimated_cost_usd": 4e-06
      },
      "model_metadata": {
        "release_date": "2024-02-26",
        "context_length": 32768,
        "hardware": "GPU",
        "api_type": "OpenAI-compatible"
      },
      "infra": {
        "schema_url": "https://openrouter.ai/docs",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://openrouter.ai/docs",
        "api_docs": "https://openrouter.ai/docs/api"
      },
      "test_metadata": {
        "request_hash": "10d6566d0f98",
        "trace_id": "trace-26568474"
      },
      "history": {
        "latency_trend_7d": [
          652,
          650,
          648,
          649,
          651,
          650,
          707
        ],
        "throughput_trend_7d": [
          41.9,
          42.1,
          42.0,
          42.2,
          42.0,
          42.0,
          29.69
        ]
      }
    },
    {
      "provider": "Google Gemini",
      "model": "Gemini-2.0-Flash",
      "metrics": {
        "latency_ms": null,
        "throughput_tokens_per_sec": null
      },
      "error": "429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/rate-limit. \\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_input_token_count, limit: 0, model: gemini-2.0-flash-exp\\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash-exp\\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash-exp\\nPlease retry in 55.092339557s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_input_token_count', 'quotaId': 'GenerateContentInputTokensPerModelPerMinute-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.0-flash-exp'}}, {'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'model': 'gemini-2.0-flash-exp', 'location': 'global'}}, {'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.0-flash-exp'}}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '55s'}]}}",
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 0.0,
        "error_rate_percent": 100.0,
        "status": "unhealthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.00075,
        "estimated_cost_usd": 0.0
      },
      "model_metadata": {
        "release_date": "2024-12-11",
        "context_length": 2000000,
        "hardware": "TPU",
        "api_type": "Google-native",
        "schema_url": "https://ai.google.dev/api",
        "human_readable_url": "https://ai.google.dev/docs",
        "api_docs": "https://ai.google.dev/api/generate-content"
      },
      "infra": {
        "schema_url": "https://ai.google.dev/api",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://ai.google.dev/docs",
        "api_docs": "https://ai.google.dev/api/generate-content"
      },
      "test_metadata": {
        "request_hash": "94590d1b6c3d",
        "trace_id": "trace-a8d6f6fc"
      },
      "history": {
        "latency_trend_7d": [
          352,
          350,
          348,
          349,
          351,
          350,
          350
        ],
        "throughput_trend_7d": [
          91.9,
          92.1,
          92.0,
          92.2,
          92.0,
          92.0,
          92.01
        ]
      }
    },
    {
      "provider": "Together AI",
      "model": "Llama3.1-8B-Turbo",
      "metrics": {
        "latency_ms": 340,
        "throughput_tokens_per_sec": 91.08
      },
      "tokens_generated": 31,
      "elapsed_seconds": 0.34,
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 100.0,
        "error_rate_percent": 0.0,
        "status": "healthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.0002,
        "estimated_cost_usd": 6e-06
      },
      "model_metadata": {
        "release_date": "2024-01-15",
        "context_length": 32768,
        "hardware": "GPU",
        "api_type": "OpenAI-compatible"
      },
      "infra": {
        "schema_url": "https://docs.together.ai/docs/inference-models",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://docs.together.ai",
        "api_docs": "https://docs.together.ai/reference/chat-completions"
      },
      "test_metadata": {
        "request_hash": "fb68f5c0eeec",
        "trace_id": "trace-03420370"
      },
      "history": {
        "latency_trend_7d": [
          120,
          118,
          116,
          117,
          119,
          118,
          340
        ],
        "throughput_trend_7d": [
          145.9,
          146.1,
          146.0,
          146.2,
          146.0,
          146.0,
          91.08
        ]
      }
    },
    {
      "provider": "Fireworks AI",
      "model": "Llama3.1-8B",
      "metrics": {
        "latency_ms": null,
        "throughput_tokens_per_sec": null
      },
      "error": "Client error '404 Not Found' for url 'https://api.fireworks.ai/inference/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404",
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 0.0,
        "error_rate_percent": 100.0,
        "status": "unhealthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.0002,
        "estimated_cost_usd": 0.0
      },
      "model_metadata": {
        "release_date": "2024-04-15",
        "context_length": 8192,
        "hardware": "GPU",
        "api_type": "OpenAI-compatible",
        "schema_url": "https://readme.fireworks.ai/reference/createchatcompletion",
        "human_readable_url": "https://fireworks.ai/",
        "api_docs": "https://readme.fireworks.ai/docs"
      },
      "infra": {
        "schema_url": "https://readme.fireworks.ai/reference/createchatcompletion",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://fireworks.ai/",
        "api_docs": "https://readme.fireworks.ai/docs"
      },
      "test_metadata": {
        "request_hash": "8d13b5b3b4f6",
        "trace_id": "trace-e30903f9"
      },
      "history": {
        "latency_trend_7d": [
          180,
          178,
          176,
          177,
          179,
          178,
          178
        ],
        "throughput_trend_7d": [
          135.9,
          136.1,
          136.0,
          136.2,
          136.0,
          136.0,
          136.01
        ]
      }
    },
    {
      "provider": "HF GPT OSS 120B (Cerebras)",
      "model": "GPT OSS 120B",
      "metrics": {
        "latency_ms": null,
        "throughput_tokens_per_sec": null
      },
      "error": "Client error '401 Unauthorized' for url 'https://router.huggingface.co/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/401",
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 0.0,
        "error_rate_percent": 100.0,
        "status": "unhealthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.0008,
        "estimated_cost_usd": 0.0
      },
      "model_metadata": {
        "release_date": "2025-01-07",
        "context_length": 128000,
        "hardware": "Cerebras WSE",
        "api_type": "OpenAI-compatible",
        "schema_url": "https://huggingface.co/docs/inference-providers",
        "human_readable_url": "https://huggingface.co/openai/gpt-oss-120b",
        "api_docs": "https://huggingface.co/docs/inference-providers/en/guides/gpt-oss"
      }
    },
    {
      "provider": "DeepSeek",
      "model": "deepseek-chat",
      "metrics": {
        "latency_ms": 1752,
        "throughput_tokens_per_sec": 9.71
      },
      "tokens_generated": 17,
      "elapsed_seconds": 1.752,
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 100.0,
        "error_rate_percent": 0.0,
        "status": "healthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.00027,
        "estimated_cost_usd": 5e-06
      },
      "model_metadata": {
        "release_date": "2024-12-01",
        "context_length": 64000,
        "hardware": "GPU",
        "api_type": "OpenAI-compatible",
        "schema_url": "https://api-docs.deepseek.com/",
        "human_readable_url": "https://platform.deepseek.com/",
        "api_docs": "https://api-docs.deepseek.com/api/create-chat-completion"
      },
      "infra": {
        "schema_url": "https://api-docs.deepseek.com/",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://platform.deepseek.com/",
        "api_docs": "https://api-docs.deepseek.com/api/create-chat-completion"
      },
      "test_metadata": {
        "request_hash": "f29beaebe905",
        "trace_id": "trace-a44b493a"
      },
      "history": {
        "latency_trend_7d": [
          280,
          278,
          276,
          277,
          279,
          278,
          278
        ],
        "throughput_trend_7d": [
          85.9,
          86.1,
          86.0,
          86.2,
          86.0,
          86.0,
          86.01
        ]
      }
    },
    {
      "provider": "Cohere",
      "model": "command-r",
      "metrics": {
        "latency_ms": null,
        "throughput_tokens_per_sec": null
      },
      "error": "Client error '404 Not Found' for url 'https://api.cohere.ai/v1/chat'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404",
      "health": {
        "cold_start_latency_ms": null,
        "warm_start_latency_ms": 750,
        "availability_percent": 0.0,
        "error_rate_percent": 100.0,
        "status": "unhealthy",
        "is_cold_start": false
      },
      "cost_estimate": {
        "cost_per_1k_tokens_usd": 0.00075,
        "estimated_cost_usd": 0.0
      },
      "model_metadata": {
        "release_date": "2024-10-01",
        "context_length": 128000,
        "hardware": "GPU",
        "api_type": "Cohere-native",
        "schema_url": "https://docs.cohere.com/reference/chat",
        "human_readable_url": "https://docs.cohere.com/",
        "api_docs": "https://docs.cohere.com/reference/chat"
      },
      "infra": {
        "schema_url": "https://docs.cohere.com/reference/chat",
        "plugin_manifest": "/.well-known/ai-plugin.json",
        "human_readable_url": "https://docs.cohere.com/",
        "api_docs": "https://docs.cohere.com/reference/chat"
      },
      "test_metadata": {
        "request_hash": "cd16c98f23fa",
        "trace_id": "trace-d64ef776"
      },
      "history": {
        "latency_trend_7d": [
          320,
          318,
          316,
          317,
          319,
          318,
          318
        ],
        "throughput_trend_7d": [
          45.9,
          46.1,
          46.0,
          46.2,
          46.0,
          46.0,
          46.01
        ]
      }
    }
  ],
  "rankings": {
    "by_latency": [
      {
        "provider": "Together AI",
        "model": "Llama3.1-8B-Turbo",
        "metrics": {
          "latency_ms": 340,
          "throughput_tokens_per_sec": 91.08
        },
        "tokens_generated": 31,
        "elapsed_seconds": 0.34,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.0002,
          "estimated_cost_usd": 6e-06
        },
        "model_metadata": {
          "release_date": "2024-01-15",
          "context_length": 32768,
          "hardware": "GPU",
          "api_type": "OpenAI-compatible"
        },
        "infra": {
          "schema_url": "https://docs.together.ai/docs/inference-models",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://docs.together.ai",
          "api_docs": "https://docs.together.ai/reference/chat-completions"
        },
        "test_metadata": {
          "request_hash": "fb68f5c0eeec",
          "trace_id": "trace-03420370"
        },
        "history": {
          "latency_trend_7d": [
            120,
            118,
            116,
            117,
            119,
            118,
            340
          ],
          "throughput_trend_7d": [
            145.9,
            146.1,
            146.0,
            146.2,
            146.0,
            146.0,
            91.08
          ]
        }
      },
      {
        "provider": "OpenRouter",
        "model": "Mistral",
        "metrics": {
          "latency_ms": 707,
          "throughput_tokens_per_sec": 29.69
        },
        "tokens_generated": 21,
        "elapsed_seconds": 0.707,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.00018,
          "estimated_cost_usd": 4e-06
        },
        "model_metadata": {
          "release_date": "2024-02-26",
          "context_length": 32768,
          "hardware": "GPU",
          "api_type": "OpenAI-compatible"
        },
        "infra": {
          "schema_url": "https://openrouter.ai/docs",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://openrouter.ai/docs",
          "api_docs": "https://openrouter.ai/docs/api"
        },
        "test_metadata": {
          "request_hash": "10d6566d0f98",
          "trace_id": "trace-26568474"
        },
        "history": {
          "latency_trend_7d": [
            652,
            650,
            648,
            649,
            651,
            650,
            707
          ],
          "throughput_trend_7d": [
            41.9,
            42.1,
            42.0,
            42.2,
            42.0,
            42.0,
            29.69
          ]
        }
      },
      {
        "provider": "OpenAI",
        "model": "GPT-4o",
        "metrics": {
          "latency_ms": 1173,
          "throughput_tokens_per_sec": 14.49
        },
        "tokens_generated": 17,
        "elapsed_seconds": 1.173,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.0025,
          "estimated_cost_usd": 4.3e-05
        },
        "model_metadata": {
          "release_date": "2024-05-13",
          "context_length": 128000,
          "hardware": "GPU",
          "api_type": "OpenAI-compatible"
        },
        "infra": {
          "schema_url": "https://platform.openai.com/docs/api-reference",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://platform.openai.com/docs",
          "api_docs": "https://platform.openai.com/docs/api-reference/chat"
        },
        "test_metadata": {
          "request_hash": "126c4dab7b56",
          "trace_id": "trace-4f8e2659"
        },
        "history": {
          "latency_trend_7d": [
            752,
            750,
            748,
            749,
            751,
            750,
            1173
          ],
          "throughput_trend_7d": [
            31.9,
            32.1,
            32.0,
            32.2,
            32.0,
            32.0,
            14.49
          ]
        }
      },
      {
        "provider": "DeepSeek",
        "model": "deepseek-chat",
        "metrics": {
          "latency_ms": 1752,
          "throughput_tokens_per_sec": 9.71
        },
        "tokens_generated": 17,
        "elapsed_seconds": 1.752,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.00027,
          "estimated_cost_usd": 5e-06
        },
        "model_metadata": {
          "release_date": "2024-12-01",
          "context_length": 64000,
          "hardware": "GPU",
          "api_type": "OpenAI-compatible",
          "schema_url": "https://api-docs.deepseek.com/",
          "human_readable_url": "https://platform.deepseek.com/",
          "api_docs": "https://api-docs.deepseek.com/api/create-chat-completion"
        },
        "infra": {
          "schema_url": "https://api-docs.deepseek.com/",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://platform.deepseek.com/",
          "api_docs": "https://api-docs.deepseek.com/api/create-chat-completion"
        },
        "test_metadata": {
          "request_hash": "f29beaebe905",
          "trace_id": "trace-a44b493a"
        },
        "history": {
          "latency_trend_7d": [
            280,
            278,
            276,
            277,
            279,
            278,
            278
          ],
          "throughput_trend_7d": [
            85.9,
            86.1,
            86.0,
            86.2,
            86.0,
            86.0,
            86.01
          ]
        }
      },
      {
        "provider": "Claude",
        "model": "Claude Sonnet 4",
        "metrics": {
          "latency_ms": 2414,
          "throughput_tokens_per_sec": 12.84
        },
        "tokens_generated": 31,
        "elapsed_seconds": 2.414,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.003,
          "estimated_cost_usd": 9.3e-05
        },
        "model_metadata": {
          "release_date": "2024-10-22",
          "context_length": 200000,
          "hardware": "GPU",
          "api_type": "Anthropic-native"
        },
        "infra": {
          "schema_url": "https://docs.anthropic.com/en/api/messages",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://docs.anthropic.com",
          "api_docs": "https://docs.anthropic.com/en/api"
        },
        "test_metadata": {
          "request_hash": "d480d15bd31b",
          "trace_id": "trace-7ebb82bf"
        },
        "history": {
          "latency_trend_7d": [
            1152,
            1150,
            1148,
            1149,
            1151,
            1150,
            2414
          ],
          "throughput_trend_7d": [
            21.9,
            22.1,
            22.0,
            22.2,
            22.0,
            22.0,
            12.84
          ]
        }
      }
    ],
    "by_throughput": [
      {
        "provider": "Together AI",
        "model": "Llama3.1-8B-Turbo",
        "metrics": {
          "latency_ms": 340,
          "throughput_tokens_per_sec": 91.08
        },
        "tokens_generated": 31,
        "elapsed_seconds": 0.34,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.0002,
          "estimated_cost_usd": 6e-06
        },
        "model_metadata": {
          "release_date": "2024-01-15",
          "context_length": 32768,
          "hardware": "GPU",
          "api_type": "OpenAI-compatible"
        },
        "infra": {
          "schema_url": "https://docs.together.ai/docs/inference-models",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://docs.together.ai",
          "api_docs": "https://docs.together.ai/reference/chat-completions"
        },
        "test_metadata": {
          "request_hash": "fb68f5c0eeec",
          "trace_id": "trace-03420370"
        },
        "history": {
          "latency_trend_7d": [
            120,
            118,
            116,
            117,
            119,
            118,
            340
          ],
          "throughput_trend_7d": [
            145.9,
            146.1,
            146.0,
            146.2,
            146.0,
            146.0,
            91.08
          ]
        }
      },
      {
        "provider": "OpenRouter",
        "model": "Mistral",
        "metrics": {
          "latency_ms": 707,
          "throughput_tokens_per_sec": 29.69
        },
        "tokens_generated": 21,
        "elapsed_seconds": 0.707,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.00018,
          "estimated_cost_usd": 4e-06
        },
        "model_metadata": {
          "release_date": "2024-02-26",
          "context_length": 32768,
          "hardware": "GPU",
          "api_type": "OpenAI-compatible"
        },
        "infra": {
          "schema_url": "https://openrouter.ai/docs",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://openrouter.ai/docs",
          "api_docs": "https://openrouter.ai/docs/api"
        },
        "test_metadata": {
          "request_hash": "10d6566d0f98",
          "trace_id": "trace-26568474"
        },
        "history": {
          "latency_trend_7d": [
            652,
            650,
            648,
            649,
            651,
            650,
            707
          ],
          "throughput_trend_7d": [
            41.9,
            42.1,
            42.0,
            42.2,
            42.0,
            42.0,
            29.69
          ]
        }
      },
      {
        "provider": "OpenAI",
        "model": "GPT-4o",
        "metrics": {
          "latency_ms": 1173,
          "throughput_tokens_per_sec": 14.49
        },
        "tokens_generated": 17,
        "elapsed_seconds": 1.173,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.0025,
          "estimated_cost_usd": 4.3e-05
        },
        "model_metadata": {
          "release_date": "2024-05-13",
          "context_length": 128000,
          "hardware": "GPU",
          "api_type": "OpenAI-compatible"
        },
        "infra": {
          "schema_url": "https://platform.openai.com/docs/api-reference",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://platform.openai.com/docs",
          "api_docs": "https://platform.openai.com/docs/api-reference/chat"
        },
        "test_metadata": {
          "request_hash": "126c4dab7b56",
          "trace_id": "trace-4f8e2659"
        },
        "history": {
          "latency_trend_7d": [
            752,
            750,
            748,
            749,
            751,
            750,
            1173
          ],
          "throughput_trend_7d": [
            31.9,
            32.1,
            32.0,
            32.2,
            32.0,
            32.0,
            14.49
          ]
        }
      },
      {
        "provider": "Claude",
        "model": "Claude Sonnet 4",
        "metrics": {
          "latency_ms": 2414,
          "throughput_tokens_per_sec": 12.84
        },
        "tokens_generated": 31,
        "elapsed_seconds": 2.414,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.003,
          "estimated_cost_usd": 9.3e-05
        },
        "model_metadata": {
          "release_date": "2024-10-22",
          "context_length": 200000,
          "hardware": "GPU",
          "api_type": "Anthropic-native"
        },
        "infra": {
          "schema_url": "https://docs.anthropic.com/en/api/messages",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://docs.anthropic.com",
          "api_docs": "https://docs.anthropic.com/en/api"
        },
        "test_metadata": {
          "request_hash": "d480d15bd31b",
          "trace_id": "trace-7ebb82bf"
        },
        "history": {
          "latency_trend_7d": [
            1152,
            1150,
            1148,
            1149,
            1151,
            1150,
            2414
          ],
          "throughput_trend_7d": [
            21.9,
            22.1,
            22.0,
            22.2,
            22.0,
            22.0,
            12.84
          ]
        }
      },
      {
        "provider": "DeepSeek",
        "model": "deepseek-chat",
        "metrics": {
          "latency_ms": 1752,
          "throughput_tokens_per_sec": 9.71
        },
        "tokens_generated": 17,
        "elapsed_seconds": 1.752,
        "health": {
          "cold_start_latency_ms": null,
          "warm_start_latency_ms": 750,
          "availability_percent": 100.0,
          "error_rate_percent": 0.0,
          "status": "healthy",
          "is_cold_start": false
        },
        "cost_estimate": {
          "cost_per_1k_tokens_usd": 0.00027,
          "estimated_cost_usd": 5e-06
        },
        "model_metadata": {
          "release_date": "2024-12-01",
          "context_length": 64000,
          "hardware": "GPU",
          "api_type": "OpenAI-compatible",
          "schema_url": "https://api-docs.deepseek.com/",
          "human_readable_url": "https://platform.deepseek.com/",
          "api_docs": "https://api-docs.deepseek.com/api/create-chat-completion"
        },
        "infra": {
          "schema_url": "https://api-docs.deepseek.com/",
          "plugin_manifest": "/.well-known/ai-plugin.json",
          "human_readable_url": "https://platform.deepseek.com/",
          "api_docs": "https://api-docs.deepseek.com/api/create-chat-completion"
        },
        "test_metadata": {
          "request_hash": "f29beaebe905",
          "trace_id": "trace-a44b493a"
        },
        "history": {
          "latency_trend_7d": [
            280,
            278,
            276,
            277,
            279,
            278,
            278
          ],
          "throughput_trend_7d": [
            85.9,
            86.1,
            86.0,
            86.2,
            86.0,
            86.0,
            86.01
          ]
        }
      }
    ]
  },
  "fastest_latency": "Together AI",
  "highest_throughput": "Together AI",
  "total_tested": 10,
  "successful_tests": 5,
  "failed_tests": 5,
  "performance_summary": {
    "best_latency_ms": 340,
    "best_throughput_tokens_per_sec": 91.08,
    "avg_latency_ms": 1277,
    "avg_throughput_tokens_per_sec": 31.56
  },
  "ai_guidance": {
    "best_for_speed": "Together AI",
    "best_for_throughput": "Together AI",
    "recommendation": "Use Together AI for lowest latency, Together AI for highest throughput",
    "use_case_guidance": {
      "real_time_chat": "Recommended: Together AI (lowest latency)",
      "bulk_generation": "Recommended: Together AI (highest throughput)",
      "balanced_workload": "Consider both Together AI and Together AI"
    }
  },
  "human_readable_summary": "\u26a1 Fastest: Together AI (340ms - Good) | \ud83d\ude80 Highest throughput: Together AI (91.1 tokens/sec - High) | \u2705 5/10 providers responding",
  "timestamp": "2026-01-15T05:04:06.694474Z"
}
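
For developers and agents consuming this endpoint programmatically, a minimal Python sketch is shown below. It assumes the payload above is served as JSON from https://inferencelatency.com/throughput via an unauthenticated GET request (the exact URL, auth requirements, and the requests dependency are assumptions for illustration, not confirmed by the payload). The sketch filters out failed providers (whose metrics are null), re-derives the latency and throughput rankings, and reproduces the performance_summary averages.

# Hypothetical client sketch: fetch the throughput report and re-derive the rankings.
# The endpoint URL is assumed from the "service" and "endpoint" fields; adjust as needed.
import requests  # pip install requests

ENDPOINT = "https://inferencelatency.com/throughput"  # assumed URL

def summarize(report: dict) -> None:
    # Keep only providers that returned real measurements; failed tests carry null metrics.
    ok = [p for p in report["providers"] if p["metrics"]["latency_ms"] is not None]

    by_latency = sorted(ok, key=lambda p: p["metrics"]["latency_ms"])
    by_throughput = sorted(ok, key=lambda p: p["metrics"]["throughput_tokens_per_sec"], reverse=True)

    avg_latency = sum(p["metrics"]["latency_ms"] for p in ok) / len(ok)
    avg_tps = sum(p["metrics"]["throughput_tokens_per_sec"] for p in ok) / len(ok)

    print(f"Fastest: {by_latency[0]['provider']} ({by_latency[0]['metrics']['latency_ms']} ms)")
    print(f"Highest throughput: {by_throughput[0]['provider']} "
          f"({by_throughput[0]['metrics']['throughput_tokens_per_sec']} tokens/sec)")
    print(f"Averages: {avg_latency:.0f} ms, {avg_tps:.2f} tokens/sec "
          f"over {len(ok)}/{report['total_tested']} responding providers")

if __name__ == "__main__":
    resp = requests.get(ENDPOINT, timeout=30)
    resp.raise_for_status()
    summarize(resp.json())

Running this against the response above reproduces the published summary (fastest: Together AI at 340 ms; highest throughput: Together AI at 91.08 tokens/sec; averages of 1277 ms and 31.56 tokens/sec over 5/10 providers). The per-provider throughput_tokens_per_sec appears to be tokens_generated divided by elapsed_seconds (for example, 17 / 1.173 ≈ 14.49 for GPT-4o), so the same figures can be re-derived from the raw counts if needed.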