{ "service": "InferenceLatency.com", "endpoint": "throughput", "description": "Combined latency and throughput benchmarking", "providers": [ { "provider": "OpenAI", "model": "GPT-4o", "metrics": { "latency_ms": 1095, "throughput_tokens_per_sec": 15.53 }, "tokens_generated": 17, "elapsed_seconds": 1.095, "health": { "cold_start_latency_ms": null, "warm_start_latency_ms": 750, "availability_percent": 100.0, "error_rate_percent": 0.0, "status": "healthy", "is_cold_start": false }, "cost_estimate": { "cost_per_1k_tokens_usd": 0.0025, "estimated_cost_usd": 4.3e-05 }, "model_metadata": { "release_date": "2024-05-13", "context_length": 128000, "hardware": "GPU", "api_type": "OpenAI-compatible" }, "infra": { "schema_url": "https://platform.openai.com/docs/api-reference", "plugin_manifest": "/.well-known/ai-plugin.json", "human_readable_url": "https://platform.openai.com/docs", "api_docs": "https://platform.openai.com/docs/api-reference/chat" }, "test_metadata": { "request_hash": "dc2b6d757486", "trace_id": "trace-e1a9b6b4" }, "history": { "latency_trend_7d": [ 752, 750, 748, 749, 751, 750, 1095 ], "throughput_trend_7d": [ 31.9, 32.1, 32.0, 32.2, 32.0, 32.0, 15.53 ] } }, { "provider": "Groq", "model": "Llama3-8B", "metrics": { "latency_ms": null, "throughput_tokens_per_sec": null }, "error": "Client error '400 Bad Request' for url 'https://api.groq.com/openai/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400", "health": { "cold_start_latency_ms": null, "warm_start_latency_ms": 750, "availability_percent": 0.0, "error_rate_percent": 100.0, "status": "unhealthy", "is_cold_start": false }, "cost_estimate": { "cost_per_1k_tokens_usd": 0.00027, "estimated_cost_usd": 0.0 }, "model_metadata": { "release_date": "2024-04-18", "context_length": 8192, "hardware": "LPU", "api_type": "OpenAI-compatible", "schema_url": "https://console.groq.com/docs/openai", "human_readable_url": "https://console.groq.com/docs", "api_docs": "https://console.groq.com/docs/api-reference" }, "infra": { "schema_url": "https://console.groq.com/docs/openai", "plugin_manifest": "/.well-known/ai-plugin.json", "human_readable_url": "https://console.groq.com/docs", "api_docs": "https://console.groq.com/docs/api-reference" }, "test_metadata": { "request_hash": "da1ed9b456ed", "trace_id": "trace-7f20559c" }, "history": { "latency_trend_7d": [ 952, 950, 948, 949, 951, 950, 950 ], "throughput_trend_7d": [ 121.9, 122.1, 122.0, 122.2, 122.0, 122.0, 122.01 ] } }, { "provider": "Claude", "model": "Claude Sonnet 4", "metrics": { "latency_ms": 2471, "throughput_tokens_per_sec": 12.55 }, "tokens_generated": 31, "elapsed_seconds": 2.471, "health": { "cold_start_latency_ms": null, "warm_start_latency_ms": 750, "availability_percent": 100.0, "error_rate_percent": 0.0, "status": "healthy", "is_cold_start": false }, "cost_estimate": { "cost_per_1k_tokens_usd": 0.003, "estimated_cost_usd": 9.3e-05 }, "model_metadata": { "release_date": "2024-10-22", "context_length": 200000, "hardware": "GPU", "api_type": "Anthropic-native" }, "infra": { "schema_url": "https://docs.anthropic.com/en/api/messages", "plugin_manifest": "/.well-known/ai-plugin.json", "human_readable_url": "https://docs.anthropic.com", "api_docs": "https://docs.anthropic.com/en/api" }, "test_metadata": { "request_hash": "bb1a07893be0", "trace_id": "trace-b646eda2" }, "history": { "latency_trend_7d": [ 1152, 1150, 1148, 1149, 1151, 1150, 2471 ], "throughput_trend_7d": [ 21.9, 22.1, 22.0, 22.2, 22.0, 22.0, 12.55 ] } }, { "provider": "OpenRouter", "model": "Mistral", "metrics": { "latency_ms": 1401, "throughput_tokens_per_sec": 16.42 }, "tokens_generated": 23, "elapsed_seconds": 1.401, "health": { "cold_start_latency_ms": null, "warm_start_latency_ms": 750, "availability_percent": 100.0, "error_rate_percent": 0.0, "status": "healthy", "is_cold_start": false }, "cost_estimate": { "cost_per_1k_tokens_usd": 0.00018, "estimated_cost_usd": 4e-06 }, "model_metadata": { "release_date": "2024-02-26", "context_length": 32768, "hardware": "GPU", "api_type": "OpenAI-compatible" }, "infra": { "schema_url": "https://openrouter.ai/docs", "plugin_manifest": "/.well-known/ai-plugin.json", "human_readable_url": "https://openrouter.ai/docs", "api_docs": "https://openrouter.ai/docs/api" }, "test_metadata": { "request_hash": "6e0c7b85890a", "trace_id": "trace-f5d01234" }, "history": { "latency_trend_7d": [ 652, 650, 648, 649, 651, 650, 1401 ], "throughput_trend_7d": [ 41.9, 42.1, 42.0, 42.2, 42.0, 42.0, 16.42 ] } }, { "provider": "Google Gemini", "model": "Gemini-2.0-Flash", "metrics": { "latency_ms": 583, "throughput_tokens_per_sec": 18.88 }, "tokens_generated": 11, "elapsed_seconds": 0.583, "health": { "cold_start_latency_ms": null, "warm_start_latency_ms": 750, "availability_percent": 100.0, "error_rate_percent": 0.0, "status": "healthy", "is_cold_start": false }, "cost_estimate": { "cost_per_1k_tokens_usd": 0.00075, "estimated_cost_usd": 8e-06 }, "model_metadata": { "release_date": "2024-12-11", "context_length": 2000000, "hardware": "TPU", "api_type": "Google-native" }, "infra": { "schema_url": "https://ai.google.dev/api", "plugin_manifest": "/.well-known/ai-plugin.json", "human_readable_url": "https://ai.google.dev/docs", "api_docs": "https://ai.google.dev/api/generate-content" }, "test_metadata": { "request_hash": "10d1eaa1ac9d", "trace_id": "trace-1ecf93d9" }, "history": { "latency_trend_7d": [ 352, 350, 348, 349, 351, 350, 583 ], "throughput_trend_7d": [ 91.9, 92.1, 92.0, 92.2, 92.0, 92.0, 18.88 ] } }, { "provider": "Together AI", "model": "Llama3.1-8B-Turbo", "metrics": { "latency_ms": 361, "throughput_tokens_per_sec": 88.65 }, "tokens_generated": 32, "elapsed_seconds": 0.361, "health": { "cold_start_latency_ms": null, "warm_start_latency_ms": 750, "availability_percent": 100.0, "error_rate_percent": 0.0, "status": "healthy", "is_cold_start": false }, "cost_estimate": { "cost_per_1k_tokens_usd": 0.0002, "estimated_cost_usd": 6e-06 }, "model_metadata": { "release_date": "2024-01-15", "context_length": 32768, "hardware": "GPU", "api_type": "OpenAI-compatible" }, "infra": { "schema_url": "https://docs.together.ai/docs/inference-models", "plugin_manifest": "/.well-known/ai-plugin.json", "human_readable_url": "https://docs.together.ai", "api_docs": "https://docs.together.ai/reference/chat-completions" }, "test_metadata": { "request_hash": "c35de0059f8b", "trace_id": "trace-4fa3ad56" }, "history": { "latency_trend_7d": [ 120, 118, 116, 117, 119, 118, 361 ], "throughput_trend_7d": [ 145.9, 146.1, 146.0, 146.2, 146.0, 146.0, 88.65 ] } }, { "provider": "Fireworks AI", "model": "Llama3.1-8B", "metrics": { "latency_ms": 440, "throughput_tokens_per_sec": 72.72 }, "tokens_generated": 32, "elapsed_seconds": 0.44, "health": { "cold_start_latency_ms": null, "warm_start_latency_ms": 750, "availability_percent": 100.0, "error_rate_percent": 0.0, "status": "healthy", "is_cold_start": false }, "cost_estimate": { "cost_per_1k_tokens_usd": 0.0002, "estimated_cost_usd": 6e-06 }, "model_metadata": { "release_date": "2024-04-15", "context_length": 8192, "hardware": "GPU", "api_type": "OpenAI-compatible" }, "infra": { "schema_url": "https://readme.fireworks.ai/reference/createchatcompletion", "plugin_manifest": "/.well-known/ai-plugin.json", "human_readable_url": "https://fireworks.ai/", "api_docs": "https://readme.fireworks.ai/docs" }, "test_metadata": { "request_hash": "f8bbfc248c7e", "trace_id": "trace-5a6f10ee" }, "history": { "latency_trend_7d": [ 180, 178, 176, 177, 179, 178, 440 ], "throughput_trend_7d": [ 135.9, 136.1, 136.0, 136.2, 136.0, 136.0, 72.72 ] } }, { "provider": "HF GPT OSS 120B (Cerebras)", "model": "GPT OSS 120B", "metrics": { "latency_ms": 412, "throughput_tokens_per_sec": 121.32 }, "tokens_generated": 50, "elapsed_seconds": 0.412, "health": { "cold_start_latency_ms": null, "warm_start_latency_ms": 750, "availability_percent": 100.0, "error_rate_percent": 0.0, "status": "healthy", "is_cold_start": false }, "cost_estimate": { "cost_per_1k_tokens_usd": 0.0008, "estimated_cost_usd": 4e-05 }, "model_metadata": { "release_date": "2025-01-07", "context_length": 128000, "hardware": "Cerebras WSE", "api_type": "OpenAI-compatible" } } ], "rankings": { "by_latency": [ { "provider": "Together AI", "model": "Llama3.1-8B-Turbo", "metrics": { "latency_ms": 361, "throughput_tokens_per_sec": 88.65 }, "tokens_generated": 32, "elapsed_seconds": 0.361, "health": { "cold_start_latency_ms": null, "warm_start_latency_ms": 750, "availability_percent": 100.0, "error_rate_percent": 0.0, "status": "healthy", "is_cold_start": false }, "cost_estimate": { "cost_per_1k_tokens_usd": 0.0002, "estimated_cost_usd": 6e-06 }, "model_metadata": { "release_date": "2024-01-15", "context_length": 32768, "hardware": "GPU", "api_type": "OpenAI-compatible" }, "infra": { "schema_url": "https://docs.together.ai/docs/inference-models", "plugin_manifest": "/.well-known/ai-plugin.json", "human_readable_url": "https://docs.together.ai", "api_docs": "https://docs.together.ai/reference/chat-completions" }, "test_metadata": { "request_hash": "c35de0059f8b", "trace_id": "trace-4fa3ad56" }, "history": { "latency_trend_7d": [ 120, 118, 116, 117, 119, 118, 361 ], "throughput_trend_7d": [ 145.9, 146.1, 146.0, 146.2, 146.0, 146.0, 88.65 ] } }, { "provider": "HF GPT OSS 120B (Cerebras)", "model": "GPT OSS 120B", "metrics": { "latency_ms": 412, "throughput_tokens_per_sec": 121.32 }, "tokens_generated": 50, "elapsed_seconds": 0.412, "health": { "cold_start_latency_ms": null, "warm_start_latency_ms": 750, "availability_percent": 100.0, "error_rate_percent": 0.0, "status": "healthy", "is_cold_start": false }, "cost_estimate": { "cost_per_1k_tokens_usd": 0.0008, "estimated_cost_usd": 4e-05 }, "model_metadata": { "release_date": "2025-01-07", "context_length": 128000, "hardware": "Cerebras WSE", "api_type": "OpenAI-compatible" } }, { "provider": "Fireworks AI", "model": "Llama3.1-8B", "metrics": { "latency_ms": 440, "throughput_tokens_per_sec": 72.72 }, "tokens_generated": 32, "elapsed_seconds": 0.44, "health": { "cold_start_latency_ms": null, "warm_start_latency_ms": 750, "availability_percent": 100.0, "error_rate_percent": 0.0, "status": "healthy", "is_cold_start": false }, "cost_estimate": { "cost_per_1k_tokens_usd": 0.0002, "estimated_cost_usd": 6e-06 }, "model_metadata": { "release_date": "2024-04-15", "context_length": 8192, "hardware": "GPU", "api_type": "OpenAI-compatible" }, "infra": { "schema_url": "https://readme.fireworks.ai/reference/createchatcompletion", "plugin_manifest": "/.well-known/ai-plugin.json", "human_readable_url": "https://fireworks.ai/", "api_docs": "https://readme.fireworks.ai/docs" }, "test_metadata": { "request_hash": "f8bbfc248c7e", "trace_id": "trace-5a6f10ee" }, "history": { "latency_trend_7d": [ 180, 178, 176, 177, 179, 178, 440 ], "throughput_trend_7d": [ 135.9, 136.1, 136.0, 136.2, 136.0, 136.0, 72.72 ] } }, { "provider": "Google Gemini", "model": "Gemini-2.0-Flash", "metrics": { "latency_ms": 583, "throughput_tokens_per_sec": 18.88 }, "tokens_generated": 11, "elapsed_seconds": 0.583, "health": { "cold_start_latency_ms": null, "warm_start_latency_ms": 750, "availability_percent": 100.0, "error_rate_percent": 0.0, "status": "healthy", "is_cold_start": false }, "cost_estimate": { "cost_per_1k_tokens_usd": 0.00075, "estimated_cost_usd": 8e-06 }, "model_metadata": { "release_date": "2024-12-11", "context_length": 2000000, "hardware": "TPU", "api_type": "Google-native" }, "infra": { "schema_url": "https://ai.google.dev/api", "plugin_manifest": "/.well-known/ai-plugin.json", "human_readable_url": "https://ai.google.dev/docs", "api_docs": "https://ai.google.dev/api/generate-content" }, "test_metadata": { "request_hash": "10d1eaa1ac9d", "trace_id": "trace-1ecf93d9" }, "history": { "latency_trend_7d": [ 352, 350, 348, 349, 351, 350, 583 ], "throughput_trend_7d": [ 91.9, 92.1, 92.0, 92.2, 92.0, 92.0, 18.88 ] } }, { "provider": "OpenAI", "model": "GPT-4o", "metrics": { "latency_ms": 1095, "throughput_tokens_per_sec": 15.53 }, "tokens_generated": 17, "elapsed_seconds": 1.095, "health": { "cold_start_latency_ms": null, "warm_start_latency_ms": 750, "availability_percent": 100.0, "error_rate_percent": 0.0, "status": "healthy", "is_cold_start": false }, "cost_estimate": { "cost_per_1k_tokens_usd": 0.0025, "estimated_cost_usd": 4.3e-05 }, "model_metadata": { "release_date": "2024-05-13", "context_length": 128000, "hardware": "GPU", "api_type": "OpenAI-compatible" }, "infra": { "schema_url": "https://platform.openai.com/docs/api-reference", "plugin_manifest": "/.well-known/ai-plugin.json", "human_readable_url": "https://platform.openai.com/docs", "api_docs": "https://platform.openai.com/docs/api-reference/chat" }, "test_metadata": { "request_hash": "dc2b6d757486", "trace_id": "trace-e1a9b6b4" }, "history": { "latency_trend_7d": [ 752, 750, 748, 749, 751, 750, 1095 ], "throughput_trend_7d": [ 31.9, 32.1, 32.0, 32.2, 32.0, 32.0, 15.53 ] } }, { "provider": "OpenRouter", "model": "Mistral", "metrics": { "latency_ms": 1401, "throughput_tokens_per_sec": 16.42 }, "tokens_generated": 23, "elapsed_seconds": 1.401, "health": { "cold_start_latency_ms": null, "warm_start_latency_ms": 750, "availability_percent": 100.0, "error_rate_percent": 0.0, "status": "healthy", "is_cold_start": false }, "cost_estimate": { "cost_per_1k_tokens_usd": 0.00018, "estimated_cost_usd": 4e-06 }, "model_metadata": { "release_date": "2024-02-26", "context_length": 32768, "hardware": "GPU", "api_type": "OpenAI-compatible" }, "infra": { "schema_url": "https://openrouter.ai/docs", "plugin_manifest": "/.well-known/ai-plugin.json", "human_readable_url": "https://openrouter.ai/docs", "api_docs": "https://openrouter.ai/docs/api" }, "test_metadata": { "request_hash": "6e0c7b85890a", "trace_id": "trace-f5d01234" }, "history": { "latency_trend_7d": [ 652, 650, 648, 649, 651, 650, 1401 ], "throughput_trend_7d": [ 41.9, 42.1, 42.0, 42.2, 42.0, 42.0, 16.42 ] } }, { "provider": "Claude", "model": "Claude Sonnet 4", "metrics": { "latency_ms": 2471, "throughput_tokens_per_sec": 12.55 }, "tokens_generated": 31, "elapsed_seconds": 2.471, "health": { "cold_start_latency_ms": null, "warm_start_latency_ms": 750, "availability_percent": 100.0, "error_rate_percent": 0.0, "status": "healthy", "is_cold_start": false }, "cost_estimate": { "cost_per_1k_tokens_usd": 0.003, "estimated_cost_usd": 9.3e-05 }, "model_metadata": { "release_date": "2024-10-22", "context_length": 200000, "hardware": "GPU", "api_type": "Anthropic-native" }, "infra": { "schema_url": "https://docs.anthropic.com/en/api/messages", "plugin_manifest": "/.well-known/ai-plugin.json", "human_readable_url": "https://docs.anthropic.com", "api_docs": "https://docs.anthropic.com/en/api" }, "test_metadata": { "request_hash": "bb1a07893be0", "trace_id": "trace-b646eda2" }, "history": { "latency_trend_7d": [ 1152, 1150, 1148, 1149, 1151, 1150, 2471 ], "throughput_trend_7d": [ 21.9, 22.1, 22.0, 22.2, 22.0, 22.0, 12.55 ] } } ], "by_throughput": [ { "provider": "HF GPT OSS 120B (Cerebras)", "model": "GPT OSS 120B", "metrics": { "latency_ms": 412, "throughput_tokens_per_sec": 121.32 }, "tokens_generated": 50, "elapsed_seconds": 0.412, "health": { "cold_start_latency_ms": null, "warm_start_latency_ms": 750, "availability_percent": 100.0, "error_rate_percent": 0.0, "status": "healthy", "is_cold_start": false }, "cost_estimate": { "cost_per_1k_tokens_usd": 0.0008, "estimated_cost_usd": 4e-05 }, "model_metadata": { "release_date": "2025-01-07", "context_length": 128000, "hardware": "Cerebras WSE", "api_type": "OpenAI-compatible" } }, { "provider": "Together AI", "model": "Llama3.1-8B-Turbo", "metrics": { "latency_ms": 361, "throughput_tokens_per_sec": 88.65 }, "tokens_generated": 32, "elapsed_seconds": 0.361, "health": { "cold_start_latency_ms": null, "warm_start_latency_ms": 750, "availability_percent": 100.0, "error_rate_percent": 0.0, "status": "healthy", "is_cold_start": false }, "cost_estimate": { "cost_per_1k_tokens_usd": 0.0002, "estimated_cost_usd": 6e-06 }, "model_metadata": { "release_date": "2024-01-15", "context_length": 32768, "hardware": "GPU", "api_type": "OpenAI-compatible" }, "infra": { "schema_url": "https://docs.together.ai/docs/inference-models", "plugin_manifest": "/.well-known/ai-plugin.json", "human_readable_url": "https://docs.together.ai", "api_docs": "https://docs.together.ai/reference/chat-completions" }, "test_metadata": { "request_hash": "c35de0059f8b", "trace_id": "trace-4fa3ad56" }, "history": { "latency_trend_7d": [ 120, 118, 116, 117, 119, 118, 361 ], "throughput_trend_7d": [ 145.9, 146.1, 146.0, 146.2, 146.0, 146.0, 88.65 ] } }, { "provider": "Fireworks AI", "model": "Llama3.1-8B", "metrics": { "latency_ms": 440, "throughput_tokens_per_sec": 72.72 }, "tokens_generated": 32, "elapsed_seconds": 0.44, "health": { "cold_start_latency_ms": null, "warm_start_latency_ms": 750, "availability_percent": 100.0, "error_rate_percent": 0.0, "status": "healthy", "is_cold_start": false }, "cost_estimate": { "cost_per_1k_tokens_usd": 0.0002, "estimated_cost_usd": 6e-06 }, "model_metadata": { "release_date": "2024-04-15", "context_length": 8192, "hardware": "GPU", "api_type": "OpenAI-compatible" }, "infra": { "schema_url": "https://readme.fireworks.ai/reference/createchatcompletion", "plugin_manifest": "/.well-known/ai-plugin.json", "human_readable_url": "https://fireworks.ai/", "api_docs": "https://readme.fireworks.ai/docs" }, "test_metadata": { "request_hash": "f8bbfc248c7e", "trace_id": "trace-5a6f10ee" }, "history": { "latency_trend_7d": [ 180, 178, 176, 177, 179, 178, 440 ], "throughput_trend_7d": [ 135.9, 136.1, 136.0, 136.2, 136.0, 136.0, 72.72 ] } }, { "provider": "Google Gemini", "model": "Gemini-2.0-Flash", "metrics": { "latency_ms": 583, "throughput_tokens_per_sec": 18.88 }, "tokens_generated": 11, "elapsed_seconds": 0.583, "health": { "cold_start_latency_ms": null, "warm_start_latency_ms": 750, "availability_percent": 100.0, "error_rate_percent": 0.0, "status": "healthy", "is_cold_start": false }, "cost_estimate": { "cost_per_1k_tokens_usd": 0.00075, "estimated_cost_usd": 8e-06 }, "model_metadata": { "release_date": "2024-12-11", "context_length": 2000000, "hardware": "TPU", "api_type": "Google-native" }, "infra": { "schema_url": "https://ai.google.dev/api", "plugin_manifest": "/.well-known/ai-plugin.json", "human_readable_url": "https://ai.google.dev/docs", "api_docs": "https://ai.google.dev/api/generate-content" }, "test_metadata": { "request_hash": "10d1eaa1ac9d", "trace_id": "trace-1ecf93d9" }, "history": { "latency_trend_7d": [ 352, 350, 348, 349, 351, 350, 583 ], "throughput_trend_7d": [ 91.9, 92.1, 92.0, 92.2, 92.0, 92.0, 18.88 ] } }, { "provider": "OpenRouter", "model": "Mistral", "metrics": { "latency_ms": 1401, "throughput_tokens_per_sec": 16.42 }, "tokens_generated": 23, "elapsed_seconds": 1.401, "health": { "cold_start_latency_ms": null, "warm_start_latency_ms": 750, "availability_percent": 100.0, "error_rate_percent": 0.0, "status": "healthy", "is_cold_start": false }, "cost_estimate": { "cost_per_1k_tokens_usd": 0.00018, "estimated_cost_usd": 4e-06 }, "model_metadata": { "release_date": "2024-02-26", "context_length": 32768, "hardware": "GPU", "api_type": "OpenAI-compatible" }, "infra": { "schema_url": "https://openrouter.ai/docs", "plugin_manifest": "/.well-known/ai-plugin.json", "human_readable_url": "https://openrouter.ai/docs", "api_docs": "https://openrouter.ai/docs/api" }, "test_metadata": { "request_hash": "6e0c7b85890a", "trace_id": "trace-f5d01234" }, "history": { "latency_trend_7d": [ 652, 650, 648, 649, 651, 650, 1401 ], "throughput_trend_7d": [ 41.9, 42.1, 42.0, 42.2, 42.0, 42.0, 16.42 ] } }, { "provider": "OpenAI", "model": "GPT-4o", "metrics": { "latency_ms": 1095, "throughput_tokens_per_sec": 15.53 }, "tokens_generated": 17, "elapsed_seconds": 1.095, "health": { "cold_start_latency_ms": null, "warm_start_latency_ms": 750, "availability_percent": 100.0, "error_rate_percent": 0.0, "status": "healthy", "is_cold_start": false }, "cost_estimate": { "cost_per_1k_tokens_usd": 0.0025, "estimated_cost_usd": 4.3e-05 }, "model_metadata": { "release_date": "2024-05-13", "context_length": 128000, "hardware": "GPU", "api_type": "OpenAI-compatible" }, "infra": { "schema_url": "https://platform.openai.com/docs/api-reference", "plugin_manifest": "/.well-known/ai-plugin.json", "human_readable_url": "https://platform.openai.com/docs", "api_docs": "https://platform.openai.com/docs/api-reference/chat" }, "test_metadata": { "request_hash": "dc2b6d757486", "trace_id": "trace-e1a9b6b4" }, "history": { "latency_trend_7d": [ 752, 750, 748, 749, 751, 750, 1095 ], "throughput_trend_7d": [ 31.9, 32.1, 32.0, 32.2, 32.0, 32.0, 15.53 ] } }, { "provider": "Claude", "model": "Claude Sonnet 4", "metrics": { "latency_ms": 2471, "throughput_tokens_per_sec": 12.55 }, "tokens_generated": 31, "elapsed_seconds": 2.471, "health": { "cold_start_latency_ms": null, "warm_start_latency_ms": 750, "availability_percent": 100.0, "error_rate_percent": 0.0, "status": "healthy", "is_cold_start": false }, "cost_estimate": { "cost_per_1k_tokens_usd": 0.003, "estimated_cost_usd": 9.3e-05 }, "model_metadata": { "release_date": "2024-10-22", "context_length": 200000, "hardware": "GPU", "api_type": "Anthropic-native" }, "infra": { "schema_url": "https://docs.anthropic.com/en/api/messages", "plugin_manifest": "/.well-known/ai-plugin.json", "human_readable_url": "https://docs.anthropic.com", "api_docs": "https://docs.anthropic.com/en/api" }, "test_metadata": { "request_hash": "bb1a07893be0", "trace_id": "trace-b646eda2" }, "history": { "latency_trend_7d": [ 1152, 1150, 1148, 1149, 1151, 1150, 2471 ], "throughput_trend_7d": [ 21.9, 22.1, 22.0, 22.2, 22.0, 22.0, 12.55 ] } } ] }, "fastest_latency": "Together AI", "highest_throughput": "HF GPT OSS 120B (Cerebras)", "total_tested": 8, "successful_tests": 7, "failed_tests": 1, "performance_summary": { "best_latency_ms": 361, "best_throughput_tokens_per_sec": 121.32, "avg_latency_ms": 966, "avg_throughput_tokens_per_sec": 49.44 }, "ai_guidance": { "best_for_speed": "Together AI", "best_for_throughput": "HF GPT OSS 120B (Cerebras)", "recommendation": "Use Together AI for lowest latency, HF GPT OSS 120B (Cerebras) for highest throughput", "use_case_guidance": { "real_time_chat": "Recommended: Together AI (lowest latency)", "bulk_generation": "Recommended: HF GPT OSS 120B (Cerebras) (highest throughput)", "balanced_workload": "Consider both Together AI and HF GPT OSS 120B (Cerebras)" } }, "human_readable_summary": "\u26a1 Fastest: Together AI (361ms - Good) | \ud83d\ude80 Highest throughput: HF GPT OSS 120B (Cerebras) (121.3 tokens/sec - High) | \u2705 7/8 providers responding", "timestamp": "2025-09-07T23:20:18.229087Z" }