{
  "@context": "https://schema.org",
  "@type": "Dataset",
  "@id": "https://requesty.ai/data/provider-latency-yoy-april-2026",
  "id": "latency-yoy-april-2026",
  "slug": "provider-latency-yoy-april-2026",
  "title": "p50 latency YoY: April 2025 vs April 2026",
  "shortTitle": "p50 latency YoY",
  "topic": "latency",
  "abstract": "Has LLM latency improved over the past year? On the Requesty gateway, open-source aggregator routes compressed dramatically between April 2025 and April 2026. xAI fell 93% (9.1 s to 0.6 s), DeepInfra 91% (15.8 s to 1.4 s), DeepSeek 62% (24.3 s to 9.2 s). Frontier providers barely moved (OpenAI -5%, Anthropic 0%). Vertex (Claude) is the only major route that got slower, +131%, as heavy agentic Claude Code workloads landed on it.",
  "whyItMatters": "The OSS-aggregator tier closed most of the latency gap to frontier providers in 12 months: routing easy work onto a cheap OSS path used to cost 5-25 seconds and now costs sub-second. Workload composition is the dominant force on aggregate latency. Vertex (Claude) getting 2.3× slower while the underlying inference stack barely changed shows that \"is provider X fast?\" is the wrong question to ask in isolation.",
  "questions": [
    "How has LLM latency changed from 2025 to 2026?",
    "Are open-source LLMs as fast as OpenAI now?",
    "Which AI providers got faster in 2026?",
    "Why are some LLM routes getting slower year-over-year?"
  ],
  "period": "Apr 2025  to  Apr 2026",
  "updated": "2026-05-09",
  "license": "CC BY 4.0",
  "licenseUrl": "https://creativecommons.org/licenses/by/4.0/",
  "caveats": [
    "Vertex (Gemini) had no meaningful 2025 traffic so it is not in this chart. Only Vertex (Claude) is YoY-comparable.",
    "Vertex (Claude) Apr 2025 sample is small and the workload that lived on it has changed substantially, so the +131% delta is more about workload mix than a true latency regression.",
    "Customer-base composition changed YoY, so the workload mix hitting these providers is different. Latency YoY is robust to this because it is wall-clock duration not affected by the request mix in aggregate, but interpret it as \"providers behave differently AND the work has shifted\", not as a controlled experiment.",
    "The `successful` flag semantics may have changed between 2025 and 2026, but quantiles over wall-clock duration are not affected."
  ],
  "keyFindings": [
    "OSS aggregator routes (xAI, DeepInfra, Alibaba, Novita, Nebius) compressed 89-93% YoY.",
    "xAI: 9.1 s  to  0.6 s (-93%). DeepInfra: 15.8 s  to  1.4 s (-91%).",
    "DeepSeek: 24.3 s  to  9.2 s (-62%). Still slow but dramatically faster.",
    "Frontier providers barely moved: OpenAI -5%, Anthropic 0%.",
    "Vertex (Claude) is the lone exception: 6.0 s  to  13.8 s (+131%). The route stayed put while heavy agentic Claude Code workloads moved onto it, so the work itself got bigger.",
    "Practical implication: routing easy work to a cheap OSS path used to cost 5-25 seconds, now costs sub-second."
  ],
  "columns": [
    {
      "key": "provider",
      "label": "Provider",
      "unit": "count"
    },
    {
      "key": "prior_ms",
      "label": "Apr 2025 p50",
      "unit": "milliseconds"
    },
    {
      "key": "current_ms",
      "label": "Apr 2026 p50",
      "unit": "milliseconds"
    },
    {
      "key": "delta_pct",
      "label": "YoY delta",
      "unit": "percent"
    }
  ],
  "rows": [
    {
      "provider": "xAI",
      "prior_ms": 9100,
      "current_ms": 600,
      "delta_pct": -0.93
    },
    {
      "provider": "DeepInfra",
      "prior_ms": 15800,
      "current_ms": 1400,
      "delta_pct": -0.91
    },
    {
      "provider": "Alibaba",
      "prior_ms": 5800,
      "current_ms": 500,
      "delta_pct": -0.91
    },
    {
      "provider": "Novita",
      "prior_ms": 8800,
      "current_ms": 800,
      "delta_pct": -0.91
    },
    {
      "provider": "Nebius",
      "prior_ms": 22100,
      "current_ms": 2300,
      "delta_pct": -0.89
    },
    {
      "provider": "DeepSeek",
      "prior_ms": 24300,
      "current_ms": 9200,
      "delta_pct": -0.62
    },
    {
      "provider": "Coding",
      "prior_ms": 7900,
      "current_ms": 6100,
      "delta_pct": -0.23
    },
    {
      "provider": "OpenAI",
      "prior_ms": 2600,
      "current_ms": 2500,
      "delta_pct": -0.05
    },
    {
      "provider": "Anthropic",
      "prior_ms": 5900,
      "current_ms": 5900,
      "delta_pct": 0
    },
    {
      "provider": "Vertex (Claude)",
      "prior_ms": 6000,
      "current_ms": 13800,
      "delta_pct": 1.31
    }
  ],
  "rowKey": "provider",
  "citation": {
    "apa": "Requesty (2026). p50 latency YoY: April 2025 vs April 2026. Requesty Data. https://requesty.ai/data/provider-latency-yoy-april-2026",
    "bibtex": "@misc{requesty_provider_latency_yoy_april_2026,\n  author       = {{Requesty}},\n  title        = {p50 latency YoY: April 2025 vs April 2026},\n  year         = {2026},\n  howpublished = {\\url{https://requesty.ai/data/provider-latency-yoy-april-2026}},\n  note         = {Requesty Data}\n}"
  },
  "permalink": "https://requesty.ai/data/provider-latency-yoy-april-2026",
  "downloads": {
    "json": "https://requesty.ai/data/provider-latency-yoy-april-2026/data.json",
    "csv": "https://requesty.ai/data/provider-latency-yoy-april-2026/data.csv",
    "markdown": "https://requesty.ai/data/provider-latency-yoy-april-2026.md"
  },
  "citedIn": [
    {
      "title": "What the gateway saw in April 2026",
      "url": "https://requesty.ai/blog/provider-trends-april-2026-agentic-share-latency"
    }
  ],
  "image": "https://requesty.ai/data/provider-latency-yoy-april-2026/opengraph-image",
  "source": {
    "organization": "Requesty",
    "url": "https://requesty.ai"
  }
}