{
"service": "InferenceLatency.com",
"endpoint": "throughput",
"description": "Combined latency and throughput benchmarking",
"providers": [
{
"provider": "OpenAI",
"model": "GPT-4o",
"metrics": {
"latency_ms": 1173,
"throughput_tokens_per_sec": 14.49
},
"tokens_generated": 17,
"elapsed_seconds": 1.173,
"health": {
"cold_start_latency_ms": null,
"warm_start_latency_ms": 750,
"availability_percent": 100.0,
"error_rate_percent": 0.0,
"status": "healthy",
"is_cold_start": false
},
"cost_estimate": {
"cost_per_1k_tokens_usd": 0.0025,
"estimated_cost_usd": 4.3e-05
},
"model_metadata": {
"release_date": "2024-05-13",
"context_length": 128000,
"hardware": "GPU",
"api_type": "OpenAI-compatible"
},
"infra": {
"schema_url": "https://platform.openai.com/docs/api-reference",
"plugin_manifest": "/.well-known/ai-plugin.json",
"human_readable_url": "https://platform.openai.com/docs",
"api_docs": "https://platform.openai.com/docs/api-reference/chat"
},
"test_metadata": {
"request_hash": "126c4dab7b56",
"trace_id": "trace-4f8e2659"
},
"history": {
"latency_trend_7d": [
752,
750,
748,
749,
751,
750,
1173
],
"throughput_trend_7d": [
31.9,
32.1,
32.0,
32.2,
32.0,
32.0,
14.49
]
}
},
{
"provider": "Groq",
"model": "Llama3-8B",
"metrics": {
"latency_ms": null,
"throughput_tokens_per_sec": null
},
"error": "Client error '400 Bad Request' for url 'https://api.groq.com/openai/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400",
"health": {
"cold_start_latency_ms": null,
"warm_start_latency_ms": 750,
"availability_percent": 0.0,
"error_rate_percent": 100.0,
"status": "unhealthy",
"is_cold_start": false
},
"cost_estimate": {
"cost_per_1k_tokens_usd": 0.00027,
"estimated_cost_usd": 0.0
},
"model_metadata": {
"release_date": "2024-04-18",
"context_length": 8192,
"hardware": "LPU",
"api_type": "OpenAI-compatible",
"schema_url": "https://console.groq.com/docs/openai",
"human_readable_url": "https://console.groq.com/docs",
"api_docs": "https://console.groq.com/docs/api-reference"
},
"infra": {
"schema_url": "https://console.groq.com/docs/openai",
"plugin_manifest": "/.well-known/ai-plugin.json",
"human_readable_url": "https://console.groq.com/docs",
"api_docs": "https://console.groq.com/docs/api-reference"
},
"test_metadata": {
"request_hash": "d739ea0c736f",
"trace_id": "trace-c1ef7e3a"
},
"history": {
"latency_trend_7d": [
952,
950,
948,
949,
951,
950,
950
],
"throughput_trend_7d": [
121.9,
122.1,
122.0,
122.2,
122.0,
122.0,
122.01
]
}
},
{
"provider": "Claude",
"model": "Claude Sonnet 4",
"metrics": {
"latency_ms": 2414,
"throughput_tokens_per_sec": 12.84
},
"tokens_generated": 31,
"elapsed_seconds": 2.414,
"health": {
"cold_start_latency_ms": null,
"warm_start_latency_ms": 750,
"availability_percent": 100.0,
"error_rate_percent": 0.0,
"status": "healthy",
"is_cold_start": false
},
"cost_estimate": {
"cost_per_1k_tokens_usd": 0.003,
"estimated_cost_usd": 9.3e-05
},
"model_metadata": {
"release_date": "2024-10-22",
"context_length": 200000,
"hardware": "GPU",
"api_type": "Anthropic-native"
},
"infra": {
"schema_url": "https://docs.anthropic.com/en/api/messages",
"plugin_manifest": "/.well-known/ai-plugin.json",
"human_readable_url": "https://docs.anthropic.com",
"api_docs": "https://docs.anthropic.com/en/api"
},
"test_metadata": {
"request_hash": "d480d15bd31b",
"trace_id": "trace-7ebb82bf"
},
"history": {
"latency_trend_7d": [
1152,
1150,
1148,
1149,
1151,
1150,
2414
],
"throughput_trend_7d": [
21.9,
22.1,
22.0,
22.2,
22.0,
22.0,
12.84
]
}
},
{
"provider": "OpenRouter",
"model": "Mistral",
"metrics": {
"latency_ms": 707,
"throughput_tokens_per_sec": 29.69
},
"tokens_generated": 21,
"elapsed_seconds": 0.707,
"health": {
"cold_start_latency_ms": null,
"warm_start_latency_ms": 750,
"availability_percent": 100.0,
"error_rate_percent": 0.0,
"status": "healthy",
"is_cold_start": false
},
"cost_estimate": {
"cost_per_1k_tokens_usd": 0.00018,
"estimated_cost_usd": 4e-06
},
"model_metadata": {
"release_date": "2024-02-26",
"context_length": 32768,
"hardware": "GPU",
"api_type": "OpenAI-compatible"
},
"infra": {
"schema_url": "https://openrouter.ai/docs",
"plugin_manifest": "/.well-known/ai-plugin.json",
"human_readable_url": "https://openrouter.ai/docs",
"api_docs": "https://openrouter.ai/docs/api"
},
"test_metadata": {
"request_hash": "10d6566d0f98",
"trace_id": "trace-26568474"
},
"history": {
"latency_trend_7d": [
652,
650,
648,
649,
651,
650,
707
],
"throughput_trend_7d": [
41.9,
42.1,
42.0,
42.2,
42.0,
42.0,
29.69
]
}
},
{
"provider": "Google Gemini",
"model": "Gemini-2.0-Flash",
"metrics": {
"latency_ms": null,
"throughput_tokens_per_sec": null
},
"error": "429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/rate-limit. \\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_input_token_count, limit: 0, model: gemini-2.0-flash-exp\\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash-exp\\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash-exp\\nPlease retry in 55.092339557s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_input_token_count', 'quotaId': 'GenerateContentInputTokensPerModelPerMinute-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.0-flash-exp'}}, {'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'model': 'gemini-2.0-flash-exp', 'location': 'global'}}, {'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.0-flash-exp'}}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '55s'}]}}",
"health": {
"cold_start_latency_ms": null,
"warm_start_latency_ms": 750,
"availability_percent": 0.0,
"error_rate_percent": 100.0,
"status": "unhealthy",
"is_cold_start": false
},
"cost_estimate": {
"cost_per_1k_tokens_usd": 0.00075,
"estimated_cost_usd": 0.0
},
"model_metadata": {
"release_date": "2024-12-11",
"context_length": 2000000,
"hardware": "TPU",
"api_type": "Google-native",
"schema_url": "https://ai.google.dev/api",
"human_readable_url": "https://ai.google.dev/docs",
"api_docs": "https://ai.google.dev/api/generate-content"
},
"infra": {
"schema_url": "https://ai.google.dev/api",
"plugin_manifest": "/.well-known/ai-plugin.json",
"human_readable_url": "https://ai.google.dev/docs",
"api_docs": "https://ai.google.dev/api/generate-content"
},
"test_metadata": {
"request_hash": "94590d1b6c3d",
"trace_id": "trace-a8d6f6fc"
},
"history": {
"latency_trend_7d": [
352,
350,
348,
349,
351,
350,
350
],
"throughput_trend_7d": [
91.9,
92.1,
92.0,
92.2,
92.0,
92.0,
92.01
]
}
},
{
"provider": "Together AI",
"model": "Llama3.1-8B-Turbo",
"metrics": {
"latency_ms": 340,
"throughput_tokens_per_sec": 91.08
},
"tokens_generated": 31,
"elapsed_seconds": 0.34,
"health": {
"cold_start_latency_ms": null,
"warm_start_latency_ms": 750,
"availability_percent": 100.0,
"error_rate_percent": 0.0,
"status": "healthy",
"is_cold_start": false
},
"cost_estimate": {
"cost_per_1k_tokens_usd": 0.0002,
"estimated_cost_usd": 6e-06
},
"model_metadata": {
"release_date": "2024-01-15",
"context_length": 32768,
"hardware": "GPU",
"api_type": "OpenAI-compatible"
},
"infra": {
"schema_url": "https://docs.together.ai/docs/inference-models",
"plugin_manifest": "/.well-known/ai-plugin.json",
"human_readable_url": "https://docs.together.ai",
"api_docs": "https://docs.together.ai/reference/chat-completions"
},
"test_metadata": {
"request_hash": "fb68f5c0eeec",
"trace_id": "trace-03420370"
},
"history": {
"latency_trend_7d": [
120,
118,
116,
117,
119,
118,
340
],
"throughput_trend_7d": [
145.9,
146.1,
146.0,
146.2,
146.0,
146.0,
91.08
]
}
},
{
"provider": "Fireworks AI",
"model": "Llama3.1-8B",
"metrics": {
"latency_ms": null,
"throughput_tokens_per_sec": null
},
"error": "Client error '404 Not Found' for url 'https://api.fireworks.ai/inference/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404",
"health": {
"cold_start_latency_ms": null,
"warm_start_latency_ms": 750,
"availability_percent": 0.0,
"error_rate_percent": 100.0,
"status": "unhealthy",
"is_cold_start": false
},
"cost_estimate": {
"cost_per_1k_tokens_usd": 0.0002,
"estimated_cost_usd": 0.0
},
"model_metadata": {
"release_date": "2024-04-15",
"context_length": 8192,
"hardware": "GPU",
"api_type": "OpenAI-compatible",
"schema_url": "https://readme.fireworks.ai/reference/createchatcompletion",
"human_readable_url": "https://fireworks.ai/",
"api_docs": "https://readme.fireworks.ai/docs"
},
"infra": {
"schema_url": "https://readme.fireworks.ai/reference/createchatcompletion",
"plugin_manifest": "/.well-known/ai-plugin.json",
"human_readable_url": "https://fireworks.ai/",
"api_docs": "https://readme.fireworks.ai/docs"
},
"test_metadata": {
"request_hash": "8d13b5b3b4f6",
"trace_id": "trace-e30903f9"
},
"history": {
"latency_trend_7d": [
180,
178,
176,
177,
179,
178,
178
],
"throughput_trend_7d": [
135.9,
136.1,
136.0,
136.2,
136.0,
136.0,
136.01
]
}
},
{
"provider": "HF GPT OSS 120B (Cerebras)",
"model": "GPT OSS 120B",
"metrics": {
"latency_ms": null,
"throughput_tokens_per_sec": null
},
"error": "Client error '401 Unauthorized' for url 'https://router.huggingface.co/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/401",
"health": {
"cold_start_latency_ms": null,
"warm_start_latency_ms": 750,
"availability_percent": 0.0,
"error_rate_percent": 100.0,
"status": "unhealthy",
"is_cold_start": false
},
"cost_estimate": {
"cost_per_1k_tokens_usd": 0.0008,
"estimated_cost_usd": 0.0
},
"model_metadata": {
"release_date": "2025-01-07",
"context_length": 128000,
"hardware": "Cerebras WSE",
"api_type": "OpenAI-compatible",
"schema_url": "https://huggingface.co/docs/inference-providers",
"human_readable_url": "https://huggingface.co/openai/gpt-oss-120b",
"api_docs": "https://huggingface.co/docs/inference-providers/en/guides/gpt-oss"
}
},
{
"provider": "DeepSeek",
"model": "deepseek-chat",
"metrics": {
"latency_ms": 1752,
"throughput_tokens_per_sec": 9.71
},
"tokens_generated": 17,
"elapsed_seconds": 1.752,
"health": {
"cold_start_latency_ms": null,
"warm_start_latency_ms": 750,
"availability_percent": 100.0,
"error_rate_percent": 0.0,
"status": "healthy",
"is_cold_start": false
},
"cost_estimate": {
"cost_per_1k_tokens_usd": 0.00027,
"estimated_cost_usd": 5e-06
},
"model_metadata": {
"release_date": "2024-12-01",
"context_length": 64000,
"hardware": "GPU",
"api_type": "OpenAI-compatible",
"schema_url": "https://api-docs.deepseek.com/",
"human_readable_url": "https://platform.deepseek.com/",
"api_docs": "https://api-docs.deepseek.com/api/create-chat-completion"
},
"infra": {
"schema_url": "https://api-docs.deepseek.com/",
"plugin_manifest": "/.well-known/ai-plugin.json",
"human_readable_url": "https://platform.deepseek.com/",
"api_docs": "https://api-docs.deepseek.com/api/create-chat-completion"
},
"test_metadata": {
"request_hash": "f29beaebe905",
"trace_id": "trace-a44b493a"
},
"history": {
"latency_trend_7d": [
280,
278,
276,
277,
279,
278,
278
],
"throughput_trend_7d": [
85.9,
86.1,
86.0,
86.2,
86.0,
86.0,
86.01
]
}
},
{
"provider": "Cohere",
"model": "command-r",
"metrics": {
"latency_ms": null,
"throughput_tokens_per_sec": null
},
"error": "Client error '404 Not Found' for url 'https://api.cohere.ai/v1/chat'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404",
"health": {
"cold_start_latency_ms": null,
"warm_start_latency_ms": 750,
"availability_percent": 0.0,
"error_rate_percent": 100.0,
"status": "unhealthy",
"is_cold_start": false
},
"cost_estimate": {
"cost_per_1k_tokens_usd": 0.00075,
"estimated_cost_usd": 0.0
},
"model_metadata": {
"release_date": "2024-10-01",
"context_length": 128000,
"hardware": "GPU",
"api_type": "Cohere-native",
"schema_url": "https://docs.cohere.com/reference/chat",
"human_readable_url": "https://docs.cohere.com/",
"api_docs": "https://docs.cohere.com/reference/chat"
},
"infra": {
"schema_url": "https://docs.cohere.com/reference/chat",
"plugin_manifest": "/.well-known/ai-plugin.json",
"human_readable_url": "https://docs.cohere.com/",
"api_docs": "https://docs.cohere.com/reference/chat"
},
"test_metadata": {
"request_hash": "cd16c98f23fa",
"trace_id": "trace-d64ef776"
},
"history": {
"latency_trend_7d": [
320,
318,
316,
317,
319,
318,
318
],
"throughput_trend_7d": [
45.9,
46.1,
46.0,
46.2,
46.0,
46.0,
46.01
]
}
}
],
"rankings": {
"by_latency": [
{
"provider": "Together AI",
"model": "Llama3.1-8B-Turbo",
"metrics": {
"latency_ms": 340,
"throughput_tokens_per_sec": 91.08
},
"tokens_generated": 31,
"elapsed_seconds": 0.34,
"health": {
"cold_start_latency_ms": null,
"warm_start_latency_ms": 750,
"availability_percent": 100.0,
"error_rate_percent": 0.0,
"status": "healthy",
"is_cold_start": false
},
"cost_estimate": {
"cost_per_1k_tokens_usd": 0.0002,
"estimated_cost_usd": 6e-06
},
"model_metadata": {
"release_date": "2024-01-15",
"context_length": 32768,
"hardware": "GPU",
"api_type": "OpenAI-compatible"
},
"infra": {
"schema_url": "https://docs.together.ai/docs/inference-models",
"plugin_manifest": "/.well-known/ai-plugin.json",
"human_readable_url": "https://docs.together.ai",
"api_docs": "https://docs.together.ai/reference/chat-completions"
},
"test_metadata": {
"request_hash": "fb68f5c0eeec",
"trace_id": "trace-03420370"
},
"history": {
"latency_trend_7d": [
120,
118,
116,
117,
119,
118,
340
],
"throughput_trend_7d": [
145.9,
146.1,
146.0,
146.2,
146.0,
146.0,
91.08
]
}
},
{
"provider": "OpenRouter",
"model": "Mistral",
"metrics": {
"latency_ms": 707,
"throughput_tokens_per_sec": 29.69
},
"tokens_generated": 21,
"elapsed_seconds": 0.707,
"health": {
"cold_start_latency_ms": null,
"warm_start_latency_ms": 750,
"availability_percent": 100.0,
"error_rate_percent": 0.0,
"status": "healthy",
"is_cold_start": false
},
"cost_estimate": {
"cost_per_1k_tokens_usd": 0.00018,
"estimated_cost_usd": 4e-06
},
"model_metadata": {
"release_date": "2024-02-26",
"context_length": 32768,
"hardware": "GPU",
"api_type": "OpenAI-compatible"
},
"infra": {
"schema_url": "https://openrouter.ai/docs",
"plugin_manifest": "/.well-known/ai-plugin.json",
"human_readable_url": "https://openrouter.ai/docs",
"api_docs": "https://openrouter.ai/docs/api"
},
"test_metadata": {
"request_hash": "10d6566d0f98",
"trace_id": "trace-26568474"
},
"history": {
"latency_trend_7d": [
652,
650,
648,
649,
651,
650,
707
],
"throughput_trend_7d": [
41.9,
42.1,
42.0,
42.2,
42.0,
42.0,
29.69
]
}
},
{
"provider": "OpenAI",
"model": "GPT-4o",
"metrics": {
"latency_ms": 1173,
"throughput_tokens_per_sec": 14.49
},
"tokens_generated": 17,
"elapsed_seconds": 1.173,
"health": {
"cold_start_latency_ms": null,
"warm_start_latency_ms": 750,
"availability_percent": 100.0,
"error_rate_percent": 0.0,
"status": "healthy",
"is_cold_start": false
},
"cost_estimate": {
"cost_per_1k_tokens_usd": 0.0025,
"estimated_cost_usd": 4.3e-05
},
"model_metadata": {
"release_date": "2024-05-13",
"context_length": 128000,
"hardware": "GPU",
"api_type": "OpenAI-compatible"
},
"infra": {
"schema_url": "https://platform.openai.com/docs/api-reference",
"plugin_manifest": "/.well-known/ai-plugin.json",
"human_readable_url": "https://platform.openai.com/docs",
"api_docs": "https://platform.openai.com/docs/api-reference/chat"
},
"test_metadata": {
"request_hash": "126c4dab7b56",
"trace_id": "trace-4f8e2659"
},
"history": {
"latency_trend_7d": [
752,
750,
748,
749,
751,
750,
1173
],
"throughput_trend_7d": [
31.9,
32.1,
32.0,
32.2,
32.0,
32.0,
14.49
]
}
},
{
"provider": "DeepSeek",
"model": "deepseek-chat",
"metrics": {
"latency_ms": 1752,
"throughput_tokens_per_sec": 9.71
},
"tokens_generated": 17,
"elapsed_seconds": 1.752,
"health": {
"cold_start_latency_ms": null,
"warm_start_latency_ms": 750,
"availability_percent": 100.0,
"error_rate_percent": 0.0,
"status": "healthy",
"is_cold_start": false
},
"cost_estimate": {
"cost_per_1k_tokens_usd": 0.00027,
"estimated_cost_usd": 5e-06
},
"model_metadata": {
"release_date": "2024-12-01",
"context_length": 64000,
"hardware": "GPU",
"api_type": "OpenAI-compatible",
"schema_url": "https://api-docs.deepseek.com/",
"human_readable_url": "https://platform.deepseek.com/",
"api_docs": "https://api-docs.deepseek.com/api/create-chat-completion"
},
"infra": {
"schema_url": "https://api-docs.deepseek.com/",
"plugin_manifest": "/.well-known/ai-plugin.json",
"human_readable_url": "https://platform.deepseek.com/",
"api_docs": "https://api-docs.deepseek.com/api/create-chat-completion"
},
"test_metadata": {
"request_hash": "f29beaebe905",
"trace_id": "trace-a44b493a"
},
"history": {
"latency_trend_7d": [
280,
278,
276,
277,
279,
278,
278
],
"throughput_trend_7d": [
85.9,
86.1,
86.0,
86.2,
86.0,
86.0,
86.01
]
}
},
{
"provider": "Claude",
"model": "Claude Sonnet 4",
"metrics": {
"latency_ms": 2414,
"throughput_tokens_per_sec": 12.84
},
"tokens_generated": 31,
"elapsed_seconds": 2.414,
"health": {
"cold_start_latency_ms": null,
"warm_start_latency_ms": 750,
"availability_percent": 100.0,
"error_rate_percent": 0.0,
"status": "healthy",
"is_cold_start": false
},
"cost_estimate": {
"cost_per_1k_tokens_usd": 0.003,
"estimated_cost_usd": 9.3e-05
},
"model_metadata": {
"release_date": "2024-10-22",
"context_length": 200000,
"hardware": "GPU",
"api_type": "Anthropic-native"
},
"infra": {
"schema_url": "https://docs.anthropic.com/en/api/messages",
"plugin_manifest": "/.well-known/ai-plugin.json",
"human_readable_url": "https://docs.anthropic.com",
"api_docs": "https://docs.anthropic.com/en/api"
},
"test_metadata": {
"request_hash": "d480d15bd31b",
"trace_id": "trace-7ebb82bf"
},
"history": {
"latency_trend_7d": [
1152,
1150,
1148,
1149,
1151,
1150,
2414
],
"throughput_trend_7d": [
21.9,
22.1,
22.0,
22.2,
22.0,
22.0,
12.84
]
}
}
],
"by_throughput": [
{
"provider": "Together AI",
"model": "Llama3.1-8B-Turbo",
"metrics": {
"latency_ms": 340,
"throughput_tokens_per_sec": 91.08
},
"tokens_generated": 31,
"elapsed_seconds": 0.34,
"health": {
"cold_start_latency_ms": null,
"warm_start_latency_ms": 750,
"availability_percent": 100.0,
"error_rate_percent": 0.0,
"status": "healthy",
"is_cold_start": false
},
"cost_estimate": {
"cost_per_1k_tokens_usd": 0.0002,
"estimated_cost_usd": 6e-06
},
"model_metadata": {
"release_date": "2024-01-15",
"context_length": 32768,
"hardware": "GPU",
"api_type": "OpenAI-compatible"
},
"infra": {
"schema_url": "https://docs.together.ai/docs/inference-models",
"plugin_manifest": "/.well-known/ai-plugin.json",
"human_readable_url": "https://docs.together.ai",
"api_docs": "https://docs.together.ai/reference/chat-completions"
},
"test_metadata": {
"request_hash": "fb68f5c0eeec",
"trace_id": "trace-03420370"
},
"history": {
"latency_trend_7d": [
120,
118,
116,
117,
119,
118,
340
],
"throughput_trend_7d": [
145.9,
146.1,
146.0,
146.2,
146.0,
146.0,
91.08
]
}
},
{
"provider": "OpenRouter",
"model": "Mistral",
"metrics": {
"latency_ms": 707,
"throughput_tokens_per_sec": 29.69
},
"tokens_generated": 21,
"elapsed_seconds": 0.707,
"health": {
"cold_start_latency_ms": null,
"warm_start_latency_ms": 750,
"availability_percent": 100.0,
"error_rate_percent": 0.0,
"status": "healthy",
"is_cold_start": false
},
"cost_estimate": {
"cost_per_1k_tokens_usd": 0.00018,
"estimated_cost_usd": 4e-06
},
"model_metadata": {
"release_date": "2024-02-26",
"context_length": 32768,
"hardware": "GPU",
"api_type": "OpenAI-compatible"
},
"infra": {
"schema_url": "https://openrouter.ai/docs",
"plugin_manifest": "/.well-known/ai-plugin.json",
"human_readable_url": "https://openrouter.ai/docs",
"api_docs": "https://openrouter.ai/docs/api"
},
"test_metadata": {
"request_hash": "10d6566d0f98",
"trace_id": "trace-26568474"
},
"history": {
"latency_trend_7d": [
652,
650,
648,
649,
651,
650,
707
],
"throughput_trend_7d": [
41.9,
42.1,
42.0,
42.2,
42.0,
42.0,
29.69
]
}
},
{
"provider": "OpenAI",
"model": "GPT-4o",
"metrics": {
"latency_ms": 1173,
"throughput_tokens_per_sec": 14.49
},
"tokens_generated": 17,
"elapsed_seconds": 1.173,
"health": {
"cold_start_latency_ms": null,
"warm_start_latency_ms": 750,
"availability_percent": 100.0,
"error_rate_percent": 0.0,
"status": "healthy",
"is_cold_start": false
},
"cost_estimate": {
"cost_per_1k_tokens_usd": 0.0025,
"estimated_cost_usd": 4.3e-05
},
"model_metadata": {
"release_date": "2024-05-13",
"context_length": 128000,
"hardware": "GPU",
"api_type": "OpenAI-compatible"
},
"infra": {
"schema_url": "https://platform.openai.com/docs/api-reference",
"plugin_manifest": "/.well-known/ai-plugin.json",
"human_readable_url": "https://platform.openai.com/docs",
"api_docs": "https://platform.openai.com/docs/api-reference/chat"
},
"test_metadata": {
"request_hash": "126c4dab7b56",
"trace_id": "trace-4f8e2659"
},
"history": {
"latency_trend_7d": [
752,
750,
748,
749,
751,
750,
1173
],
"throughput_trend_7d": [
31.9,
32.1,
32.0,
32.2,
32.0,
32.0,
14.49
]
}
},
{
"provider": "Claude",
"model": "Claude Sonnet 4",
"metrics": {
"latency_ms": 2414,
"throughput_tokens_per_sec": 12.84
},
"tokens_generated": 31,
"elapsed_seconds": 2.414,
"health": {
"cold_start_latency_ms": null,
"warm_start_latency_ms": 750,
"availability_percent": 100.0,
"error_rate_percent": 0.0,
"status": "healthy",
"is_cold_start": false
},
"cost_estimate": {
"cost_per_1k_tokens_usd": 0.003,
"estimated_cost_usd": 9.3e-05
},
"model_metadata": {
"release_date": "2024-10-22",
"context_length": 200000,
"hardware": "GPU",
"api_type": "Anthropic-native"
},
"infra": {
"schema_url": "https://docs.anthropic.com/en/api/messages",
"plugin_manifest": "/.well-known/ai-plugin.json",
"human_readable_url": "https://docs.anthropic.com",
"api_docs": "https://docs.anthropic.com/en/api"
},
"test_metadata": {
"request_hash": "d480d15bd31b",
"trace_id": "trace-7ebb82bf"
},
"history": {
"latency_trend_7d": [
1152,
1150,
1148,
1149,
1151,
1150,
2414
],
"throughput_trend_7d": [
21.9,
22.1,
22.0,
22.2,
22.0,
22.0,
12.84
]
}
},
{
"provider": "DeepSeek",
"model": "deepseek-chat",
"metrics": {
"latency_ms": 1752,
"throughput_tokens_per_sec": 9.71
},
"tokens_generated": 17,
"elapsed_seconds": 1.752,
"health": {
"cold_start_latency_ms": null,
"warm_start_latency_ms": 750,
"availability_percent": 100.0,
"error_rate_percent": 0.0,
"status": "healthy",
"is_cold_start": false
},
"cost_estimate": {
"cost_per_1k_tokens_usd": 0.00027,
"estimated_cost_usd": 5e-06
},
"model_metadata": {
"release_date": "2024-12-01",
"context_length": 64000,
"hardware": "GPU",
"api_type": "OpenAI-compatible",
"schema_url": "https://api-docs.deepseek.com/",
"human_readable_url": "https://platform.deepseek.com/",
"api_docs": "https://api-docs.deepseek.com/api/create-chat-completion"
},
"infra": {
"schema_url": "https://api-docs.deepseek.com/",
"plugin_manifest": "/.well-known/ai-plugin.json",
"human_readable_url": "https://platform.deepseek.com/",
"api_docs": "https://api-docs.deepseek.com/api/create-chat-completion"
},
"test_metadata": {
"request_hash": "f29beaebe905",
"trace_id": "trace-a44b493a"
},
"history": {
"latency_trend_7d": [
280,
278,
276,
277,
279,
278,
278
],
"throughput_trend_7d": [
85.9,
86.1,
86.0,
86.2,
86.0,
86.0,
86.01
]
}
}
]
},
"fastest_latency": "Together AI",
"highest_throughput": "Together AI",
"total_tested": 10,
"successful_tests": 5,
"failed_tests": 5,
"performance_summary": {
"best_latency_ms": 340,
"best_throughput_tokens_per_sec": 91.08,
"avg_latency_ms": 1277,
"avg_throughput_tokens_per_sec": 31.56
},
"ai_guidance": {
"best_for_speed": "Together AI",
"best_for_throughput": "Together AI",
"recommendation": "Use Together AI for lowest latency, Together AI for highest throughput",
"use_case_guidance": {
"real_time_chat": "Recommended: Together AI (lowest latency)",
"bulk_generation": "Recommended: Together AI (highest throughput)",
"balanced_workload": "Consider both Together AI and Together AI"
}
},
"human_readable_summary": "\u26a1 Fastest: Together AI (340ms - Good) | \ud83d\ude80 Highest throughput: Together AI (91.1 tokens/sec - High) | \u2705 5/10 providers responding",
"timestamp": "2026-01-15T05:04:06.694474Z"
}