{"object":"list","data":[{"id":"google/gemini-1.5-flash-8b","object":"model","created":0,"owned_by":"deepinfra","root":"google/gemini-1.5-flash-8b","parent":null,"metadata":{"description":"","context_length":1000000,"max_tokens":1000000,"pricing":{"input_tokens":0.0375,"output_tokens":0.15},"tags":["vision","reasoning_effort"]}},{"id":"Qwen/Qwen3.5-27B","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3.5-27B","parent":null,"metadata":{"description":"Qwen3.5-27B is Alibaba's largest dense Qwen3.5 model, delivering near-frontier quality across reasoning, coding, and instruction following. It features a 262K token context window (extensible to 1M), thinking/reasoning mode, tool calling, multi-token prediction, and support for 201 languages. Best suited for production deployments and complex enterprise tasks requiring top-tier performance.","context_length":262144,"max_tokens":262144,"pricing":{"input_tokens":0.26,"output_tokens":2.5999999999999996},"tags":["vision","reasoning_effort"]}},{"id":"deepseek-ai/DeepSeek-R1-0528","object":"model","created":0,"owned_by":"deepinfra","root":"deepseek-ai/DeepSeek-R1-0528","parent":null,"metadata":{"description":"The DeepSeek R1 model has undergone a minor version upgrade, with the current version being DeepSeek-R1-0528.","context_length":163840,"max_tokens":163840,"pricing":{"input_tokens":0.5,"output_tokens":2.15,"cache_read_tokens":0.35},"tags":["prompt_cache","reasoning"]}},{"id":"meta-llama/Llama-3.2-11B-Vision-Instruct","object":"model","created":0,"owned_by":"deepinfra","root":"meta-llama/Llama-3.2-11B-Vision-Instruct","parent":null,"metadata":{"description":"Llama 3.2 11B Vision is a multimodal model with 11 billion parameters, designed to handle tasks combining visual and textual data. It excels in tasks such as image captioning and visual question answering, bridging the gap between language generation and visual reasoning. 
Pre-trained on a massive dataset of image-text pairs, it performs well in complex, high-accuracy image analysis.  Its ability to integrate visual understanding with language processing makes it an ideal solution for industries requiring comprehensive visual-linguistic AI applications, such as content creation, AI-driven customer service, and research.","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.048999999999999995,"output_tokens":0.048999999999999995},"tags":["vision"]}},{"id":"allenai/olmOCR-2-7B-1025","object":"model","created":0,"owned_by":"deepinfra","root":"allenai/olmOCR-2-7B-1025","parent":null,"metadata":{"description":"olmOCR is a specialized AI tool that converts PDF documents into clean, structured text while preserving important formatting and layout information. What makes olmOCR particularly valuable for developers is its ability to handle challenging PDFs that traditional OCR tools struggle with—including complex layouts, poor-quality scans, handwritten text, and documents with mixed content types. Built on a fine-tuned 7B vision-language model, olmOCR provides enterprise-grade PDF processing at a fraction of the cost of proprietary solutions.","context_length":16384,"max_tokens":16384,"pricing":{"input_tokens":0.09,"output_tokens":0.19},"tags":["vision"]}},{"id":"stepfun-ai/Step-3.5-Flash","object":"model","created":0,"owned_by":"deepinfra","root":"stepfun-ai/Step-3.5-Flash","parent":null,"metadata":{"description":"Step 3.5 Flash is an open-source reasoning model by StepFun with 196B total parameters (11B active) using Mixture of Experts. 
It features a 256K context window, deep reasoning, tool calling, and agentic capabilities, achieving 97.3 on AIME 2025 and 74.4% on SWE-bench Verified.","context_length":262144,"max_tokens":262144,"pricing":{"input_tokens":0.1,"output_tokens":0.3,"cache_read_tokens":0.020000000000000004},"tags":["prompt_cache"]}},{"id":"mistralai/Mistral-Small-3.2-24B-Instruct-2506","object":"model","created":0,"owned_by":"deepinfra","root":"mistralai/Mistral-Small-3.2-24B-Instruct-2506","parent":null,"metadata":{"description":"Mistral-Small-3.2-24B-Instruct is a drop-in upgrade over the 3.1 release, with markedly better instruction following, roughly half the infinite-generation errors, and a more robust function-calling interface—while otherwise matching or slightly improving on all previous text and vision benchmarks.","context_length":128000,"max_tokens":128000,"pricing":{"input_tokens":0.075,"output_tokens":0.2},"tags":["vision"]}},{"id":"Bria/fibo_edit","object":"model","created":0,"owned_by":"deepinfra","root":"Bria/fibo_edit","parent":null,"metadata":null},{"id":"ByteDance/Seedream-4.5","object":"model","created":0,"owned_by":"deepinfra","root":"ByteDance/Seedream-4.5","parent":null,"metadata":null},{"id":"meta-llama/Meta-Llama-3.1-70B-Instruct","object":"model","created":0,"owned_by":"deepinfra","root":"meta-llama/Meta-Llama-3.1-70B-Instruct","parent":null,"metadata":{"description":"Meta developed and released the Meta Llama 3.1 family of large language models (LLMs), a collection of pretrained and instruction tuned generative text models in 8B, 70B and 405B sizes","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.4,"output_tokens":0.4},"tags":[]}},{"id":"BAAI/bge-en-icl","object":"model","created":0,"owned_by":"deepinfra","root":"BAAI/bge-en-icl","parent":null,"metadata":null},{"id":"Qwen/Qwen3.5-122B-A10B","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3.5-122B-A10B","parent":null,"metadata":{"description":"Qwen3.5-122B-A10B is 
a large Mixture-of-Experts model from Alibaba's Qwen3.5 series with 122B total parameters and 10B activated per token. It features a 262K token context window (extensible to 1M with YaRN), thinking/reasoning mode, tool calling, and support for 201 languages. Excels at complex reasoning, coding, multimodal understanding, and agentic tasks with the efficiency of sparse activation.","context_length":262144,"max_tokens":262144,"pricing":{"input_tokens":0.29,"output_tokens":2.9000000000000004},"tags":["vision","reasoning_effort"]}},{"id":"deepseek-ai/DeepSeek-V3","object":"model","created":0,"owned_by":"deepinfra","root":"deepseek-ai/DeepSeek-V3","parent":null,"metadata":{"description":"DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. To achieve efficient inference and cost-effective training, DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architectures, which were thoroughly validated in DeepSeek-V2. ","context_length":163840,"max_tokens":163840,"pricing":{"input_tokens":0.32,"output_tokens":0.8899999999999999},"tags":[]}},{"id":"zai-org/GLM-4.6","object":"model","created":0,"owned_by":"deepinfra","root":"zai-org/GLM-4.6","parent":null,"metadata":{"description":"Compared with GLM-4.5, GLM-4.6 brings several key improvements:  Longer context window: The context window has been expanded from 128K to 200K tokens, enabling the model to handle more complex agentic tasks. Superior coding performance: The model achieves higher scores on code benchmarks and demonstrates better real-world performance in applications such as Claude Code、Cline、Roo Code and Kilo Code, including improvements in generating visually polished front-end pages. Advanced reasoning: GLM-4.6 shows a clear improvement in reasoning performance and supports tool use during inference, leading to stronger overall capability. 
More capable agents: GLM-4.6 exhibits stronger performance in tool using and search-based agents, and integrates more effectively within agent frameworks. Refined writing: Better aligns with human preferences in style and readability, and performs more naturally in role-playing scenarios.","context_length":202752,"max_tokens":202752,"pricing":{"input_tokens":0.43,"output_tokens":1.74,"cache_read_tokens":0.0799999993},"tags":["prompt_cache","reasoning"]}},{"id":"thenlper/gte-base","object":"model","created":0,"owned_by":"deepinfra","root":"thenlper/gte-base","parent":null,"metadata":null},{"id":"sentence-transformers/all-mpnet-base-v2","object":"model","created":0,"owned_by":"deepinfra","root":"sentence-transformers/all-mpnet-base-v2","parent":null,"metadata":null},{"id":"Qwen/Qwen3-Coder-480B-A35B-Instruct","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3-Coder-480B-A35B-Instruct","parent":null,"metadata":{"description":"Qwen3-Coder-480B-A35B-Instruct is the Qwen3's most agentic code model, featuring Significant Performance on Agentic Coding, Agentic Browser-Use and other foundational coding tasks, achieving results comparable to Claude Sonnet.","context_length":262144,"max_tokens":262144,"pricing":{"input_tokens":0.4,"output_tokens":1.6},"tags":[]}},{"id":"meta-llama/Llama-3.3-70B-Instruct-Turbo","object":"model","created":0,"owned_by":"deepinfra","root":"meta-llama/Llama-3.3-70B-Instruct-Turbo","parent":null,"metadata":{"description":"Llama 3.3-70B Turbo is a highly optimized version of the Llama 3.3-70B model, utilizing FP8 quantization to deliver significantly faster inference speeds with a minor trade-off in accuracy. The model is designed to be helpful, safe, and flexible, with a focus on responsible deployment and mitigating potential risks such as bias, toxicity, and misinformation. 
It achieves state-of-the-art performance on various benchmarks, including conversational tasks, language translation, and text generation.","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.1,"output_tokens":0.32},"tags":[]}},{"id":"PrunaAI/p-image-Edit","object":"model","created":0,"owned_by":"deepinfra","root":"PrunaAI/p-image-Edit","parent":null,"metadata":null},{"id":"black-forest-labs/FLUX-1.1-pro","object":"model","created":0,"owned_by":"deepinfra","root":"black-forest-labs/FLUX-1.1-pro","parent":null,"metadata":null},{"id":"Qwen/Qwen3-Embedding-0.6B-batch","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3-Embedding-0.6B-batch","parent":null,"metadata":null},{"id":"deepseek-ai/Janus-Pro-7B","object":"model","created":0,"owned_by":"deepinfra","root":"deepseek-ai/Janus-Pro-7B","parent":null,"metadata":null},{"id":"google/gemini-2.5-flash","object":"model","created":0,"owned_by":"deepinfra","root":"google/gemini-2.5-flash","parent":null,"metadata":{"description":"Gemini 2.5 Flash is Google's latest thinking model, designed to tackle increasingly complex problems. It's capable of reasoning through their thoughts before responding, resulting in enhanced performance and improved accuracy.  Gemini 2.5 Flash: best for balancing reasoning and speed.","context_length":1000000,"max_tokens":1000000,"pricing":{"input_tokens":0.3,"output_tokens":2.5},"tags":["vision","reasoning_effort","reasoning"]}},{"id":"zai-org/GLM-5","object":"model","created":0,"owned_by":"deepinfra","root":"zai-org/GLM-5","parent":null,"metadata":{"description":"GLM-5 is an advanced, open-source large language model designed for developers tackling the toughest challenges. 
It excels at long-context reasoning, multi-step tool orchestration, and complex systems engineering, making it the ideal choice for powering sophisticated agents and applications that require high-level cognitive tasks.","context_length":202752,"max_tokens":202752,"pricing":{"input_tokens":0.8,"output_tokens":2.56,"cache_read_tokens":0.16000000000000003},"tags":["prompt_cache","reasoning"]}},{"id":"Wan-AI/Wan2.6-T2I","object":"model","created":0,"owned_by":"deepinfra","root":"Wan-AI/Wan2.6-T2I","parent":null,"metadata":null},{"id":"meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo","object":"model","created":0,"owned_by":"deepinfra","root":"meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo","parent":null,"metadata":{"description":"Meta developed and released the Meta Llama 3.1 family of large language models (LLMs), a collection of pretrained and instruction tuned generative text models in 8B, 70B and 405B sizes","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.02,"output_tokens":0.030000000000000002},"tags":[]}},{"id":"google/gemini-2.5-pro","object":"model","created":0,"owned_by":"deepinfra","root":"google/gemini-2.5-pro","parent":null,"metadata":{"description":"Gemini 2.5 Pro is Google's the most advanced thinking model, designed to tackle increasingly complex problems. Gemini 2.5 Pro leads common benchmarks by meaningful margins and showcases strong reasoning and code capabilities.  Gemini 2.5 models are thinking models, capable of reasoning through their thoughts before responding, resulting in enhanced performance and improved accuracy.  
The Gemini 2.5 Pro model is now available on DeepInfra.","context_length":1000000,"max_tokens":1000000,"pricing":{"input_tokens":1.25,"output_tokens":10.0},"tags":["vision","reasoning_effort","reasoning"]}},{"id":"Qwen/Qwen3-30B-A3B","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3-30B-A3B","parent":null,"metadata":{"description":"Qwen3 is the latest generation of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support","context_length":40960,"max_tokens":40960,"pricing":{"input_tokens":0.08,"output_tokens":0.28},"tags":["reasoning_effort","reasoning"]}},{"id":"Sao10K/L3.3-70B-Euryale-v2.3","object":"model","created":0,"owned_by":"deepinfra","root":"Sao10K/L3.3-70B-Euryale-v2.3","parent":null,"metadata":{"description":"L3.3-70B-Euryale-v2.3 is a model focused on creative roleplay from Sao10k","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.85,"output_tokens":0.85},"tags":[]}},{"id":"deepseek-ai/DeepSeek-V3.1-Terminus","object":"model","created":0,"owned_by":"deepinfra","root":"deepseek-ai/DeepSeek-V3.1-Terminus","parent":null,"metadata":{"description":"DeepSeek-V3.1 Terminus is an update to DeepSeek V3.1 that maintains the model's original capabilities while addressing issues reported by users, including language consistency and agent capabilities, further optimizing the model's performance in coding and search agents. It is a large hybrid reasoning model (671B parameters, 37B active) that supports both thinking and non-thinking modes. It extends the DeepSeek-V3 base with a two-phase long-context training process. Users can control the reasoning behaviour with the reasoning enabled boolean. 
Learn more in our docs  The model improves tool use, code generation, and reasoning efficiency, achieving performance comparable to DeepSeek-R1 on difficult benchmarks while responding more quickly. It supports structured tool calling, code agents, and search agents, making it suitable for research, coding, and agentic workflows.","context_length":163840,"max_tokens":163840,"pricing":{"input_tokens":0.21,"output_tokens":0.7899999999999999,"cache_read_tokens":0.1300000002},"tags":["prompt_cache","reasoning_effort","reasoning"]}},{"id":"Qwen/Qwen3-Next-80B-A3B-Instruct","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3-Next-80B-A3B-Instruct","parent":null,"metadata":{"description":"Over the past few months, we have observed increasingly clear trends toward scaling both total parameters and context lengths in the pursuit of more powerful and agentic artificial intelligence (AI). We are excited to share our latest advancements in addressing these demands, centered on improving scaling efficiency through innovative model architecture. 
We call this next-generation foundation models Qwen3-Next.","context_length":262144,"max_tokens":262144,"pricing":{"input_tokens":0.09,"output_tokens":1.1},"tags":["reasoning_effort"]}},{"id":"meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo","object":"model","created":0,"owned_by":"deepinfra","root":"meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo","parent":null,"metadata":{"description":"Meta developed and released the Meta Llama 3.1 family of large language models (LLMs), a collection of pretrained and instruction tuned generative text models in 8B, 70B and 405B sizes","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.4,"output_tokens":0.4},"tags":[]}},{"id":"black-forest-labs/FLUX-2-klein-9b","object":"model","created":0,"owned_by":"deepinfra","root":"black-forest-labs/FLUX-2-klein-9b","parent":null,"metadata":null},{"id":"Bria/enhance","object":"model","created":0,"owned_by":"deepinfra","root":"Bria/enhance","parent":null,"metadata":null},{"id":"google/gemma-3-4b-it","object":"model","created":0,"owned_by":"deepinfra","root":"google/gemma-3-4b-it","parent":null,"metadata":{"description":"Gemma 3 introduces multimodality, supporting vision-language input and text outputs. It handles context windows up to 128k tokens, understands over 140 languages, and offers improved math, reasoning, and chat capabilities, including structured outputs and function calling. 
Gemma 3-12B is Google's latest open source model, successor to Gemma 2","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.04,"output_tokens":0.08},"tags":["vision"]}},{"id":"ByteDance/Seedream-4","object":"model","created":0,"owned_by":"deepinfra","root":"ByteDance/Seedream-4","parent":null,"metadata":null},{"id":"Qwen/Qwen3-235B-A22B-Instruct-2507","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3-235B-A22B-Instruct-2507","parent":null,"metadata":{"description":"Qwen3-235B-A22B-Instruct-2507 is the updated version of the Qwen3-235B-A22B non-thinking mode, featuring Significant improvements in general capabilities, including instruction following, logical reasoning, text comprehension, mathematics, science, coding and tool usage.  ","context_length":262144,"max_tokens":262144,"pricing":{"input_tokens":0.071,"output_tokens":0.1},"tags":["reasoning_effort"]}},{"id":"mistralai/Mistral-Nemo-Instruct-2407","object":"model","created":0,"owned_by":"deepinfra","root":"mistralai/Mistral-Nemo-Instruct-2407","parent":null,"metadata":{"description":"12B model trained jointly by Mistral AI and NVIDIA, it significantly outperforms existing models smaller or similar in size.","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.02,"output_tokens":0.04},"tags":[]}},{"id":"Sao10K/L3-8B-Lunaris-v1-Turbo","object":"model","created":0,"owned_by":"deepinfra","root":"Sao10K/L3-8B-Lunaris-v1-Turbo","parent":null,"metadata":{"description":"","context_length":8192,"max_tokens":8192,"pricing":{"input_tokens":0.04,"output_tokens":0.05},"tags":[]}},{"id":"Qwen/Qwen3.5-397B-A17B","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3.5-397B-A17B","parent":null,"metadata":{"description":"Qwen3.5-397B-A17B is Alibaba's most capable Qwen3.5 model, a Mixture-of-Experts architecture with 397B total parameters and 17B activated per token. 
It features a 262K token context window (extensible to 1M with YaRN), thinking/reasoning mode, tool calling with MCP integration, and support for 201 languages. Sets state-of-the-art results on reasoning, coding, math, and multimodal benchmarks.","context_length":262144,"max_tokens":262144,"pricing":{"input_tokens":0.54,"output_tokens":3.4},"tags":["vision","reasoning_effort"]}},{"id":"Qwen/Qwen3-VL-30B-A3B-Instruct","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3-VL-30B-A3B-Instruct","parent":null,"metadata":{"description":"Meet Qwen3-VL — the most powerful vision-language model in the Qwen series to date.  This generation delivers comprehensive upgrades across the board: superior text understanding & generation, deeper visual perception & reasoning, extended context length, enhanced spatial and video dynamics comprehension, and stronger agent interaction capabilities.","context_length":262144,"max_tokens":262144,"pricing":{"input_tokens":0.15,"output_tokens":0.6},"tags":["vision","reasoning_effort"]}},{"id":"google/gemma-3-27b-it","object":"model","created":0,"owned_by":"deepinfra","root":"google/gemma-3-27b-it","parent":null,"metadata":{"description":"Gemma 3 introduces multimodality, supporting vision-language input and text outputs. It handles context windows up to 128k tokens, understands over 140 languages, and offers improved math, reasoning, and chat capabilities, including structured outputs and function calling. 
Gemma 3 27B is Google's latest open source model, successor to Gemma 2","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.08,"output_tokens":0.16},"tags":["vision"]}},{"id":"Wan-AI/Wan2.7-Image-Edit","object":"model","created":0,"owned_by":"deepinfra","root":"Wan-AI/Wan2.7-Image-Edit","parent":null,"metadata":null},{"id":"BAAI/bge-large-en-v1.5","object":"model","created":0,"owned_by":"deepinfra","root":"BAAI/bge-large-en-v1.5","parent":null,"metadata":null},{"id":"deepseek-ai/DeepSeek-R1-0528-Turbo","object":"model","created":0,"owned_by":"deepinfra","root":"deepseek-ai/DeepSeek-R1-0528-Turbo","parent":null,"metadata":{"description":"The DeepSeek R1 0528 turbo model is a state of the art reasoning model that can generate very quick responses","context_length":32768,"max_tokens":32768,"pricing":{"input_tokens":1.0,"output_tokens":2.9999999999999996},"tags":["reasoning"]}},{"id":"moonshotai/Kimi-K2.5-Turbo","object":"model","created":0,"owned_by":"deepinfra","root":"moonshotai/Kimi-K2.5-Turbo","parent":null,"metadata":{"description":"","context_length":262144,"max_tokens":262144,"pricing":{"input_tokens":0.6,"output_tokens":2.9999999999999996,"cache_read_tokens":0.10000000199999999},"tags":["prompt_cache","reasoning"]}},{"id":"moonshotai/Kimi-K2-Thinking","object":"model","created":0,"owned_by":"deepinfra","root":"moonshotai/Kimi-K2-Thinking","parent":null,"metadata":{"description":"Kimi K2 Thinking is the latest, most capable version of open-source thinking model developed by MoonshotAI","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.47,"output_tokens":2.0,"cache_read_tokens":0.141},"tags":["prompt_cache","reasoning"]}},{"id":"anthropic/claude-4-opus","object":"model","created":0,"owned_by":"deepinfra","root":"anthropic/claude-4-opus","parent":null,"metadata":{"description":"Anthropic’s most powerful model yet and the state-of-the-art coding model. 
It delivers sustained performance on long-running tasks that require focused effort and thousands of steps, significantly expanding what AI agents can solve. Claude Opus 4 is ideal for powering frontier agent products and features.","context_length":200000,"max_tokens":200000,"pricing":{"input_tokens":16.5,"output_tokens":82.5},"tags":["vision","reasoning_effort"]}},{"id":"Gryphe/MythoMax-L2-13b","object":"model","created":0,"owned_by":"deepinfra","root":"Gryphe/MythoMax-L2-13b","parent":null,"metadata":{"description":"","context_length":4096,"max_tokens":4096,"pricing":{"input_tokens":0.4,"output_tokens":0.4},"tags":[]}},{"id":"deepseek-ai/DeepSeek-OCR","object":"model","created":0,"owned_by":"deepinfra","root":"deepseek-ai/DeepSeek-OCR","parent":null,"metadata":{"description":"DeepSeek-OCR as an initial investigation into the feasibility of compressing long contexts via optical 2D mapping. DeepSeek-OCR consists of two components: DeepEncoder and DeepSeek3B-MoE-A570M as the decoder. Specifically, DeepEncoder serves as the core engine, designed to maintain low activations under high-resolution input while achieving high compression ratios to ensure an optimal and manageable number of vision tokens. Experiments show that when the number of text tokens is within 10 times that of vision tokens (i.e., a compression ratio < 10x), the model can achieve decoding (OCR) precision of 97%. Even at a compression ratio of 20x, the OCR accuracy still remains at about 60%. This shows considerable promise for research areas such as historical long-context compression and memory forgetting mechanisms in LLMs.","context_length":8192,"max_tokens":8192,"pricing":{"input_tokens":0.030000000000000002,"output_tokens":0.1},"tags":["vision"]}},{"id":"Qwen/Qwen3-Max-Thinking","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3-Max-Thinking","parent":null,"metadata":{"description":"The latest flagship reasoning model in the Qwen3 family. 
Further enhanced by multiple innovations like adaptive tool-use and advanced test-time scaling techniques","context_length":256000,"max_tokens":256000,"pricing":{"input_tokens":1.2,"output_tokens":5.999999999999999,"cache_read_tokens":0.24},"tags":["prompt_cache","reasoning_effort"]}},{"id":"thenlper/gte-large","object":"model","created":0,"owned_by":"deepinfra","root":"thenlper/gte-large","parent":null,"metadata":null},{"id":"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8","object":"model","created":0,"owned_by":"deepinfra","root":"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8","parent":null,"metadata":{"description":"The Llama 4 collection of models are natively multimodal AI models that enable text and multimodal experiences. These models leverage a mixture-of-experts architecture to offer industry-leading performance in text and image understanding. Llama 4 Maverick, a 17 billion parameter model with 128 experts","context_length":1048576,"max_tokens":1048576,"pricing":{"input_tokens":0.15,"output_tokens":0.6},"tags":["vision"]}},{"id":"moonshotai/Kimi-K2-Instruct-0905","object":"model","created":0,"owned_by":"deepinfra","root":"moonshotai/Kimi-K2-Instruct-0905","parent":null,"metadata":{"description":"Kimi K2 0905 is the September update of Kimi K2 0711. It is a large-scale Mixture-of-Experts (MoE) language model developed by Moonshot AI, featuring 1 trillion total parameters with 32 billion active per forward pass. It supports long-context inference up to 256k tokens, extended from the previous 128k.  This update improves agentic coding with higher accuracy and better generalization across scaffolds, and enhances frontend coding with more aesthetic and functional outputs for web, 3D, and related tasks. Kimi K2 is optimized for agentic capabilities, including advanced tool use, reasoning, and code synthesis. It excels across coding (LiveCodeBench, SWE-bench), reasoning (ZebraLogic, GPQA), and tool-use (Tau2, AceBench) benchmarks. 
The model is trained with a novel stack incorporating the MuonClip optimizer for stable large-scale MoE training.","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.4,"output_tokens":2.0,"cache_read_tokens":0.15000000000000002},"tags":["prompt_cache"]}},{"id":"black-forest-labs/FLUX-1-dev","object":"model","created":0,"owned_by":"deepinfra","root":"black-forest-labs/FLUX-1-dev","parent":null,"metadata":null},{"id":"sentence-transformers/all-MiniLM-L12-v2","object":"model","created":0,"owned_by":"deepinfra","root":"sentence-transformers/all-MiniLM-L12-v2","parent":null,"metadata":null},{"id":"mistralai/Mistral-Small-24B-Instruct-2501","object":"model","created":0,"owned_by":"deepinfra","root":"mistralai/Mistral-Small-24B-Instruct-2501","parent":null,"metadata":{"description":"Mistral Small 3 is a 24B-parameter language model optimized for low-latency performance across common AI tasks. Released under the Apache 2.0 license, it features both pre-trained and instruction-tuned versions designed for efficient local deployment.  
The model achieves 81% accuracy on the MMLU benchmark and performs competitively with larger models like Llama 3.3 70B and Qwen 32B, while operating at three times the speed on equivalent hardware.","context_length":32768,"max_tokens":32768,"pricing":{"input_tokens":0.05,"output_tokens":0.08},"tags":[]}},{"id":"Qwen/Qwen3-Embedding-8B","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3-Embedding-8B","parent":null,"metadata":null},{"id":"Qwen/Qwen3-Embedding-4B","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3-Embedding-4B","parent":null,"metadata":null},{"id":"Bria/replace_background","object":"model","created":0,"owned_by":"deepinfra","root":"Bria/replace_background","parent":null,"metadata":null},{"id":"Qwen/Qwen3-32B","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3-32B","parent":null,"metadata":{"description":"Qwen3 is the latest generation of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support","context_length":40960,"max_tokens":40960,"pricing":{"input_tokens":0.08,"output_tokens":0.28},"tags":["reasoning_effort","reasoning"]}},{"id":"nvidia/NVIDIA-Nemotron-Nano-9B-v2","object":"model","created":0,"owned_by":"deepinfra","root":"nvidia/NVIDIA-Nemotron-Nano-9B-v2","parent":null,"metadata":{"description":"NVIDIA-Nemotron-Nano-9B-v2 is a large language model (LLM) trained from scratch by NVIDIA, and designed as a unified model for both reasoning and non-reasoning tasks. It responds to user queries and tasks by first generating a reasoning trace and then concluding with a final response.  The model's reasoning capabilities can be controlled via a system prompt. 
If the user prefers the model to provide its final answer without intermediate reasoning traces, it can be configured to do so.","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.04,"output_tokens":0.16},"tags":["reasoning"]}},{"id":"sentence-transformers/multi-qa-mpnet-base-dot-v1","object":"model","created":0,"owned_by":"deepinfra","root":"sentence-transformers/multi-qa-mpnet-base-dot-v1","parent":null,"metadata":null},{"id":"BAAI/bge-base-en-v1.5","object":"model","created":0,"owned_by":"deepinfra","root":"BAAI/bge-base-en-v1.5","parent":null,"metadata":null},{"id":"intfloat/multilingual-e5-large","object":"model","created":0,"owned_by":"deepinfra","root":"intfloat/multilingual-e5-large","parent":null,"metadata":null},{"id":"Sao10K/L3.1-70B-Euryale-v2.2","object":"model","created":0,"owned_by":"deepinfra","root":"Sao10K/L3.1-70B-Euryale-v2.2","parent":null,"metadata":{"description":"Euryale 3.1 - 70B v2.2 is a model focused on creative roleplay from Sao10k","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.85,"output_tokens":0.85},"tags":[]}},{"id":"Bria/Bria-3.2","object":"model","created":0,"owned_by":"deepinfra","root":"Bria/Bria-3.2","parent":null,"metadata":null},{"id":"ClarityAI/creative","object":"model","created":0,"owned_by":"deepinfra","root":"ClarityAI/creative","parent":null,"metadata":null},{"id":"allenai/Olmo-3.1-32B-Instruct","object":"model","created":0,"owned_by":"deepinfra","root":"allenai/Olmo-3.1-32B-Instruct","parent":null,"metadata":{"description":"Olmo is a series of Open language models, developed by Allen Institute for AI (Ai2), designed to enable the science of language models. 
","context_length":65536,"max_tokens":65536,"pricing":{"input_tokens":0.2,"output_tokens":0.6},"tags":[]}},{"id":"openai/gpt-oss-20b","object":"model","created":0,"owned_by":"deepinfra","root":"openai/gpt-oss-20b","parent":null,"metadata":{"description":"gpt-oss-20b is an open-weight 21B parameter model released by OpenAI under the Apache 2.0 license. It uses a Mixture-of-Experts (MoE) architecture with 3.6B active parameters per forward pass, optimized for lower-latency inference. The model is trained in OpenAI’s Harmony response format and supports reasoning level configuration, fine-tuning, and agentic capabilities including function calling, tool use, and structured outputs.","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.030000000000000002,"output_tokens":0.14},"tags":["reasoning_effort","reasoning"]}},{"id":"deepseek-ai/DeepSeek-V3.1","object":"model","created":0,"owned_by":"deepinfra","root":"deepseek-ai/DeepSeek-V3.1","parent":null,"metadata":{"description":"DeepSeek-V3.1 is post-trained on the top of DeepSeek-V3.1-Base, which is built upon the original V3 base checkpoint through a two-phase long context extension approach, following the methodology outlined in the original DeepSeek-V3 report. We have expanded our dataset by collecting additional long documents and substantially extending both training phases. The 32K extension phase has been increased 10-fold to 630B tokens, while the 128K extension phase has been extended by 3.3x to 209B tokens. 
Additionally, DeepSeek-V3.1 is trained using the UE8M0 FP8 scale data format to ensure compatibility with microscaling data formats.","context_length":163840,"max_tokens":163840,"pricing":{"input_tokens":0.21,"output_tokens":0.7899999999999999,"cache_read_tokens":0.1300000002},"tags":["prompt_cache","reasoning_effort","reasoning"]}},{"id":"nvidia/Nemotron-3-Nano-30B-A3B","object":"model","created":0,"owned_by":"deepinfra","root":"nvidia/Nemotron-3-Nano-30B-A3B","parent":null,"metadata":{"description":"NVIDIA Nemotron 3 Nano is an open small reasoning model optimized for fast, cost-efficient inference in agentic and production workloads. Built with a hybrid Mixture-of-Experts (MoE) and Mamba-Transformer architecture, it delivers strong multi-step reasoning, high token throughput, stable latency with predictable cost, and efficient deployment for agent-based systems. Designed for real-world AI systems where reasoning can generate significantly more tokens per prompt, Nemotron Nano reduces compute cost while maintaining strong reasoning quality.","context_length":262144,"max_tokens":262144,"pricing":{"input_tokens":0.05,"output_tokens":0.2},"tags":["reasoning"]}},{"id":"meta-llama/Llama-4-Scout-17B-16E-Instruct","object":"model","created":0,"owned_by":"deepinfra","root":"meta-llama/Llama-4-Scout-17B-16E-Instruct","parent":null,"metadata":{"description":"The Llama 4 collection of models are natively multimodal AI models that enable text and multimodal experiences. These models leverage a mixture-of-experts architecture to offer industry-leading performance in text and image understanding. 
Llama 4 Scout, a 17 billion parameter model with 16 experts","context_length":327680,"max_tokens":327680,"pricing":{"input_tokens":0.08,"output_tokens":0.3},"tags":["vision"]}},{"id":"sentence-transformers/clip-ViT-B-32-multilingual-v1","object":"model","created":0,"owned_by":"deepinfra","root":"sentence-transformers/clip-ViT-B-32-multilingual-v1","parent":null,"metadata":null},{"id":"nvidia/Llama-3.3-Nemotron-Super-49B-v1.5","object":"model","created":0,"owned_by":"deepinfra","root":"nvidia/Llama-3.3-Nemotron-Super-49B-v1.5","parent":null,"metadata":{"description":"Llama-3.3-Nemotron-Super-49B-v1.5 is a large language model (LLM) optimized for advanced reasoning, conversational interactions, retrieval-augmented generation (RAG), and tool-calling tasks. Derived from Meta's Llama-3.3-70B-Instruct, it employs a Neural Architecture Search (NAS) approach, significantly enhancing efficiency and reducing memory requirements. ","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.1,"output_tokens":0.4},"tags":["reasoning"]}},{"id":"Qwen/Qwen3.5-35B-A3B","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3.5-35B-A3B","parent":null,"metadata":{"description":"Qwen3.5-35B-A3B is an efficient Mixture-of-Experts model from Alibaba's Qwen3.5 series with 35B total parameters and only 3B activated per token. It features a 262K token context window (extensible to 1M with YaRN), thinking/reasoning mode, tool calling, and support for 201 languages. 
Delivers strong performance on reasoning, coding, and vision-language tasks at a fraction of the compute cost.","context_length":262144,"max_tokens":262144,"pricing":{"input_tokens":0.21999999999999997,"output_tokens":2.2},"tags":["vision","reasoning_effort"]}},{"id":"black-forest-labs/FLUX-2-klein-4b","object":"model","created":0,"owned_by":"deepinfra","root":"black-forest-labs/FLUX-2-klein-4b","parent":null,"metadata":null},{"id":"Bria/Bria-3.2-vector","object":"model","created":0,"owned_by":"deepinfra","root":"Bria/Bria-3.2-vector","parent":null,"metadata":null},{"id":"Bria/erase","object":"model","created":0,"owned_by":"deepinfra","root":"Bria/erase","parent":null,"metadata":null},{"id":"deepseek-ai/Janus-Pro-1B","object":"model","created":0,"owned_by":"deepinfra","root":"deepseek-ai/Janus-Pro-1B","parent":null,"metadata":null},{"id":"ClarityAI/flux","object":"model","created":0,"owned_by":"deepinfra","root":"ClarityAI/flux","parent":null,"metadata":null},{"id":"black-forest-labs/FLUX-2-max","object":"model","created":0,"owned_by":"deepinfra","root":"black-forest-labs/FLUX-2-max","parent":null,"metadata":null},{"id":"NousResearch/Hermes-3-Llama-3.1-70B","object":"model","created":0,"owned_by":"deepinfra","root":"NousResearch/Hermes-3-Llama-3.1-70B","parent":null,"metadata":{"description":"Hermes 3 is a generalist language model with many improvements over Hermes 2, including advanced agentic capabilities, much better roleplaying, reasoning, multi-turn conversation, long context coherence, and improvements across the board.","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.3,"output_tokens":0.3},"tags":[]}},{"id":"NousResearch/Hermes-3-Llama-3.1-405B","object":"model","created":0,"owned_by":"deepinfra","root":"NousResearch/Hermes-3-Llama-3.1-405B","parent":null,"metadata":{"description":"Hermes 3 is a cutting-edge language model that offers advanced capabilities in roleplaying, reasoning, and conversation. 
It's a fine-tuned version of the Llama-3.1 405B foundation model, designed to align with user needs and provide powerful control. Key features include reliable function calling, structured output, generalist assistant capabilities, and improved code generation. Hermes 3 is competitive with Llama-3.1 Instruct models, with its own strengths and weaknesses.","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":1.0,"output_tokens":1.0},"tags":[]}},{"id":"MiniMaxAI/MiniMax-M2.1","object":"model","created":0,"owned_by":"deepinfra","root":"MiniMaxAI/MiniMax-M2.1","parent":null,"metadata":{"description":"MiniMax-M2.1 is a model optimized specifically for robustness in coding, tool use, instruction following, and long-horizon planning. From automating multilingual software development to executing complex, multi-step office workflows, MiniMax-M2.1 empowers developers to build the next generation of autonomous applications—all while being fully transparent, controllable, and accessible.","context_length":196608,"max_tokens":196608,"pricing":{"input_tokens":0.27,"output_tokens":0.95,"cache_read_tokens":0.0290000007},"tags":["prompt_cache"]}},{"id":"black-forest-labs/FLUX-1-Redux-dev","object":"model","created":0,"owned_by":"deepinfra","root":"black-forest-labs/FLUX-1-Redux-dev","parent":null,"metadata":null},{"id":"BAAI/bge-m3-multi","object":"model","created":0,"owned_by":"deepinfra","root":"BAAI/bge-m3-multi","parent":null,"metadata":null},{"id":"Qwen/Qwen-Image-Edit-Max","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen-Image-Edit-Max","parent":null,"metadata":null},{"id":"stabilityai/sdxl-turbo","object":"model","created":0,"owned_by":"deepinfra","root":"stabilityai/sdxl-turbo","parent":null,"metadata":null},{"id":"Qwen/Qwen3-VL-235B-A22B-Instruct","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3-VL-235B-A22B-Instruct","parent":null,"metadata":{"description":"Meet Qwen3-VL — the most powerful vision-language model 
in the Qwen series to date.  This generation delivers comprehensive upgrades across the board: superior text understanding & generation, deeper visual perception & reasoning, extended context length, enhanced spatial and video dynamics comprehension, and stronger agent interaction capabilities.","context_length":262144,"max_tokens":262144,"pricing":{"input_tokens":0.2,"output_tokens":0.8799999999999999,"cache_read_tokens":0.11000000000000001},"tags":["vision","prompt_cache","reasoning_effort"]}},{"id":"BAAI/bge-m3","object":"model","created":0,"owned_by":"deepinfra","root":"BAAI/bge-m3","parent":null,"metadata":null},{"id":"moonshotai/Kimi-K2.5","object":"model","created":0,"owned_by":"deepinfra","root":"moonshotai/Kimi-K2.5","parent":null,"metadata":{"description":"Kimi K2.5 is an open-source, native multimodal agentic model built through continual pretraining on approximately 15 trillion mixed visual and text tokens atop Kimi-K2-Base. It seamlessly integrates vision and language understanding with advanced agentic capabilities, instant and thinking modes, as well as conversational and agentic paradigms.","context_length":262144,"max_tokens":262144,"pricing":{"input_tokens":0.45,"output_tokens":2.25,"cache_read_tokens":0.070000002},"tags":["vision","prompt_cache","reasoning"]}},{"id":"Bria/remove_background","object":"model","created":0,"owned_by":"deepinfra","root":"Bria/remove_background","parent":null,"metadata":null},{"id":"zai-org/GLM-4.7-Flash","object":"model","created":0,"owned_by":"deepinfra","root":"zai-org/GLM-4.7-Flash","parent":null,"metadata":{"description":"GLM-4.7-Flash is a 30B-A3B MoE model. 
As the strongest model in the 30B class, GLM-4.7-Flash offers a new option for lightweight deployment that balances performance and efficiency.","context_length":202752,"max_tokens":202752,"pricing":{"input_tokens":0.060000000000000005,"output_tokens":0.4,"cache_read_tokens":0.0100000002},"tags":["prompt_cache","reasoning"]}},{"id":"deepseek-ai/DeepSeek-R1-Distill-Llama-70B","object":"model","created":0,"owned_by":"deepinfra","root":"deepseek-ai/DeepSeek-R1-Distill-Llama-70B","parent":null,"metadata":{"description":"DeepSeek-R1-Distill-Llama-70B is a highly efficient language model that leverages knowledge distillation to achieve state-of-the-art performance. This model distills the reasoning patterns of larger models into a smaller, more agile architecture, resulting in exceptional results on benchmarks like AIME 2024, MATH-500, and LiveCodeBench. With 70 billion parameters, DeepSeek-R1-Distill-Llama-70B offers a unique balance of accuracy and efficiency, making it an ideal choice for a wide range of natural language processing tasks. ","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.7,"output_tokens":0.8},"tags":["reasoning"]}},{"id":"microsoft/phi-4","object":"model","created":0,"owned_by":"deepinfra","root":"microsoft/phi-4","parent":null,"metadata":{"description":"Phi-4 is a model built upon a blend of synthetic datasets, data from filtered public domain websites, and acquired academic books and Q&A datasets. 
The goal of this approach was to ensure that small capable models were trained with data focused on high quality and advanced reasoning.","context_length":16384,"max_tokens":16384,"pricing":{"input_tokens":0.07,"output_tokens":0.14},"tags":[]}},{"id":"intfloat/multilingual-e5-large-instruct","object":"model","created":0,"owned_by":"deepinfra","root":"intfloat/multilingual-e5-large-instruct","parent":null,"metadata":null},{"id":"openai/gpt-oss-120b","object":"model","created":0,"owned_by":"deepinfra","root":"openai/gpt-oss-120b","parent":null,"metadata":{"description":"gpt-oss-120b is an open-weight, 117B-parameter Mixture-of-Experts (MoE) language model from OpenAI designed for high-reasoning, agentic, and general-purpose production use cases. The model supports configurable reasoning depth, full chain-of-thought access, and native tool use, including function calling, browsing, and structured output generation.","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.039,"output_tokens":0.19},"tags":["reasoning_effort","reasoning"]}},{"id":"Qwen/Qwen-Image-Edit","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen-Image-Edit","parent":null,"metadata":null},{"id":"MiniMaxAI/MiniMax-M2.5","object":"model","created":0,"owned_by":"deepinfra","root":"MiniMaxAI/MiniMax-M2.5","parent":null,"metadata":{"description":"MiniMax M2.5 is SOTA in coding, agentic tool use and search, office work, and a range of other economically valuable tasks, boasting scores of 80.2% in SWE-Bench Verified, 51.3% in Multi-SWE-Bench, and 76.3% in BrowseComp (with context 
management).","context_length":196608,"max_tokens":196608,"pricing":{"input_tokens":0.27,"output_tokens":0.95,"cache_read_tokens":0.029999999700000002},"tags":["prompt_cache","reasoning"]}},{"id":"black-forest-labs/FLUX-2-pro","object":"model","created":0,"owned_by":"deepinfra","root":"black-forest-labs/FLUX-2-pro","parent":null,"metadata":null},{"id":"Qwen/Qwen3-Embedding-0.6B","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3-Embedding-0.6B","parent":null,"metadata":null},{"id":"black-forest-labs/FLUX-1-schnell","object":"model","created":0,"owned_by":"deepinfra","root":"black-forest-labs/FLUX-1-schnell","parent":null,"metadata":null},{"id":"google/gemma-3-12b-it","object":"model","created":0,"owned_by":"deepinfra","root":"google/gemma-3-12b-it","parent":null,"metadata":{"description":"Gemma 3 introduces multimodality, supporting vision-language input and text outputs. It handles context windows up to 128k tokens, understands over 140 languages, and offers improved math, reasoning, and chat capabilities, including structured outputs and function calling. 
Gemma 3-12B is Google's latest open source model, successor to Gemma 2","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.04,"output_tokens":0.13},"tags":["vision"]}},{"id":"Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo","parent":null,"metadata":{"description":"Qwen3-Coder-480B-A35B-Instruct is the Qwen3's most agentic code model, featuring Significant Performance on Agentic Coding, Agentic Browser-Use and other foundational coding tasks, achieving results comparable to Claude Sonnet.","context_length":262144,"max_tokens":262144,"pricing":{"input_tokens":0.21999999999999997,"output_tokens":1.0,"cache_read_tokens":0.022},"tags":["prompt_cache"]}},{"id":"deepseek-ai/DeepSeek-V3-0324","object":"model","created":0,"owned_by":"deepinfra","root":"deepseek-ai/DeepSeek-V3-0324","parent":null,"metadata":{"description":"DeepSeek-V3-0324, a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token, an improved iteration over DeepSeek-V3.","context_length":163840,"max_tokens":163840,"pricing":{"input_tokens":0.2,"output_tokens":0.77,"cache_read_tokens":0.135},"tags":["prompt_cache"]}},{"id":"nvidia/NVIDIA-Nemotron-3-Super-120B-A12B","object":"model","created":0,"owned_by":"deepinfra","root":"nvidia/NVIDIA-Nemotron-3-Super-120B-A12B","parent":null,"metadata":{"description":"NVIDIA Nemotron 3 Super is a hybrid Mixture-of-Experts (MoE) model engineered for highest compute efficiency and accuracy in multi-agent applications and specialized agentic systems. 
It is optimized to run many collaborating agents per application on a single GPU, delivering high accuracy for reasoning, tool use, and instruction following.","context_length":262144,"max_tokens":262144,"pricing":{"input_tokens":0.1,"output_tokens":0.5,"cache_read_tokens":0.1},"tags":["prompt_cache","reasoning"]}},{"id":"google/gemini-1.5-flash","object":"model","created":0,"owned_by":"deepinfra","root":"google/gemini-1.5-flash","parent":null,"metadata":{"description":"Gemini 1.5 Flash is Google's foundation model that performs well at a variety of multimodal tasks such as visual understanding, classification, summarization, and creating content from image, audio and video. It's adept at processing visual and text inputs such as photographs, documents, infographics, and screenshots.  Gemini 1.5 Flash is designed for high-volume, high-frequency tasks where cost and latency matter. ","context_length":1000000,"max_tokens":1000000,"pricing":{"input_tokens":0.075,"output_tokens":0.3},"tags":["vision","reasoning_effort"]}},{"id":"Bria/fibo","object":"model","created":0,"owned_by":"deepinfra","root":"Bria/fibo","parent":null,"metadata":null},{"id":"nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL","object":"model","created":0,"owned_by":"deepinfra","root":"nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL","parent":null,"metadata":{"description":"NVIDIA Nemotron 2 Nano VL extends the Nemotron family into multi-modal reasoning and document intelligence. This auto-regressive vision-language model enables multi-image reasoning, video understanding, visual Q&A and document analysis and summarization. 
Optimized for enterprise AI workflows, it powers multimodal agentic systems such as visual copilots, document assistants, and knowledge automation pipelines.","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.2,"output_tokens":0.6},"tags":["vision","reasoning"]}},{"id":"zai-org/GLM-4.7","object":"model","created":0,"owned_by":"deepinfra","root":"zai-org/GLM-4.7","parent":null,"metadata":{"description":"GLM-4.7 is a state-of-the-art, multilingual Mixture-of-Experts (MoE) language model designed for complex reasoning, agentic coding, and tool use. Building on its predecessor GLM-4.6, it delivers significant improvements across key benchmarks, including multilingual SWE-bench, Terminal Bench, and reasoning-heavy evaluations like HLE. The model features advanced \"Interleaved Thinking\" and new \"Preserved Thinking\" modes, allowing it to reason before actions and maintain consistency across long, multi-turn tasks. With 358 billion parameters, GLM-4.7 excels in generating clean code, modern UI elements, and sophisticated reasoning outputs.","context_length":202752,"max_tokens":202752,"pricing":{"input_tokens":0.4,"output_tokens":1.75,"cache_read_tokens":0.08000000000000002},"tags":["prompt_cache","reasoning"]}},{"id":"intfloat/e5-base-v2","object":"model","created":0,"owned_by":"deepinfra","root":"intfloat/e5-base-v2","parent":null,"metadata":null},{"id":"Qwen/Qwen3-235B-A22B-Thinking-2507","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3-235B-A22B-Thinking-2507","parent":null,"metadata":{"description":"Qwen3-235B-A22B-Thinking-2507 is the Qwen3's new model with scaling the thinking capability of Qwen3-235B-A22B, improving both the quality and depth of reasoning. 
","context_length":262144,"max_tokens":262144,"pricing":{"input_tokens":0.22999999999999998,"output_tokens":2.3,"cache_read_tokens":0.20000000059999998},"tags":["prompt_cache","reasoning_effort","reasoning"]}},{"id":"Qwen/Qwen3-Embedding-4B-batch","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3-Embedding-4B-batch","parent":null,"metadata":null},{"id":"anthropic/claude-4-sonnet","object":"model","created":0,"owned_by":"deepinfra","root":"anthropic/claude-4-sonnet","parent":null,"metadata":{"description":"Anthropic's mid-size model with superior intelligence for high-volume uses in coding, in-depth research, agents, & more.","context_length":200000,"max_tokens":200000,"pricing":{"input_tokens":3.3000000000000003,"output_tokens":16.5},"tags":["vision","reasoning_effort"]}},{"id":"anthropic/claude-3-7-sonnet-latest","object":"model","created":0,"owned_by":"deepinfra","root":"anthropic/claude-3-7-sonnet-latest","parent":null,"metadata":{"description":"","context_length":200000,"max_tokens":200000,"pricing":{"input_tokens":3.3000000000000003,"output_tokens":16.5,"cache_read_tokens":0.33000000000000007},"tags":["vision","prompt_cache","reasoning_effort"]}},{"id":"black-forest-labs/FLUX-2-dev","object":"model","created":0,"owned_by":"deepinfra","root":"black-forest-labs/FLUX-2-dev","parent":null,"metadata":null},{"id":"Qwen/Qwen3-Embedding-8B-batch","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3-Embedding-8B-batch","parent":null,"metadata":null},{"id":"black-forest-labs/FLUX-pro","object":"model","created":0,"owned_by":"deepinfra","root":"black-forest-labs/FLUX-pro","parent":null,"metadata":null},{"id":"Qwen/Qwen2.5-72B-Instruct","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen2.5-72B-Instruct","parent":null,"metadata":{"description":"Qwen2.5 is a model pretrained on a large-scale dataset of up to 18 trillion tokens, offering significant improvements in knowledge, coding, mathematics, and instruction 
following compared to its predecessor Qwen2. The model also features enhanced capabilities in generating long texts, understanding structured data, and generating structured outputs, while supporting multilingual capabilities for over 29 languages.","context_length":32768,"max_tokens":32768,"pricing":{"input_tokens":0.12000000000000001,"output_tokens":0.38999999999999996},"tags":[]}},{"id":"sentence-transformers/all-MiniLM-L6-v2","object":"model","created":0,"owned_by":"deepinfra","root":"sentence-transformers/all-MiniLM-L6-v2","parent":null,"metadata":null},{"id":"ByteDance/Seed-1.8","object":"model","created":0,"owned_by":"deepinfra","root":"ByteDance/Seed-1.8","parent":null,"metadata":{"description":"Optimized specifically for multimodal agent scenarios. It features enhanced agent capabilities, upgraded multimodal comprehension, and more flexible context management.","context_length":256000,"max_tokens":256000,"pricing":{"input_tokens":0.25,"output_tokens":2.0,"cache_read_tokens":0.05},"tags":["vision","prompt_cache","reasoning"]}},{"id":"Qwen/Qwen3-Max","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3-Max","parent":null,"metadata":{"description":"The latest flagship model in the Qwen family. State-of-the-art results across a comprehensive suite of benchmarks — including knowledge, reasoning, coding, instruction following, human preference alignment, agent tasks, and multilingual understanding.","context_length":256000,"max_tokens":256000,"pricing":{"input_tokens":1.2,"output_tokens":5.999999999999999,"cache_read_tokens":0.24},"tags":["prompt_cache","reasoning_effort"]}},{"id":"Qwen/Qwen3-14B","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3-14B","parent":null,"metadata":{"description":"Qwen3 is the latest generation of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. 
Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support. ","context_length":40960,"max_tokens":40960,"pricing":{"input_tokens":0.12000000000000001,"output_tokens":0.24000000000000002},"tags":["reasoning_effort","reasoning"]}},{"id":"black-forest-labs/FLUX.1-Kontext-dev","object":"model","created":0,"owned_by":"deepinfra","root":"black-forest-labs/FLUX.1-Kontext-dev","parent":null,"metadata":null},{"id":"sentence-transformers/paraphrase-MiniLM-L6-v2","object":"model","created":0,"owned_by":"deepinfra","root":"sentence-transformers/paraphrase-MiniLM-L6-v2","parent":null,"metadata":null},{"id":"shibing624/text2vec-base-chinese","object":"model","created":0,"owned_by":"deepinfra","root":"shibing624/text2vec-base-chinese","parent":null,"metadata":null},{"id":"nvidia/llama-nemotron-embed-vl-1b-v2","object":"model","created":0,"owned_by":"deepinfra","root":"nvidia/llama-nemotron-embed-vl-1b-v2","parent":null,"metadata":null},{"id":"google/embeddinggemma-300m","object":"model","created":0,"owned_by":"deepinfra","root":"google/embeddinggemma-300m","parent":null,"metadata":null},{"id":"meta-llama/Llama-Guard-4-12B","object":"model","created":0,"owned_by":"deepinfra","root":"meta-llama/Llama-Guard-4-12B","parent":null,"metadata":{"description":"Llama Guard 4 is a natively multimodal safety classifier with 12 billion parameters trained jointly on text and multiple images. Llama Guard 4 is a dense architecture pruned from the Llama 4 Scout pre-trained model and fine-tuned for content safety classification. Similar to previous versions, it can be used to classify content in both LLM inputs (prompt classification) and in LLM responses (response classification). 
It itself acts as an LLM: it generates text in its output that indicates whether a given prompt or response is safe or unsafe, and if unsafe, it also lists the content categories violated.","context_length":163840,"max_tokens":163840,"pricing":{"input_tokens":0.18,"output_tokens":0.18},"tags":["vision"]}},{"id":"Qwen/Qwen2.5-VL-32B-Instruct","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen2.5-VL-32B-Instruct","parent":null,"metadata":{"description":"","context_length":128000,"max_tokens":128000,"pricing":{"input_tokens":0.2,"output_tokens":0.6},"tags":["vision"]}},{"id":"nvidia/Llama-3.1-Nemotron-70B-Instruct","object":"model","created":0,"owned_by":"deepinfra","root":"nvidia/Llama-3.1-Nemotron-70B-Instruct","parent":null,"metadata":{"description":"Llama-3.1-Nemotron-70B-Instruct is a large language model customized by NVIDIA to improve the helpfulness of LLM generated responses to user queries. This model reaches Arena Hard of 85.0, AlpacaEval 2 LC of 57.6 and GPT-4-Turbo MT-Bench of 8.98, which are known to be predictive of LMSys Chatbot Arena Elo.  
As of 16th Oct 2024, this model is #1 on all three automatic alignment benchmarks (verified tab for AlpacaEval 2 LC), edging out strong frontier models such as GPT-4o and Claude 3.5 Sonnet.","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":1.2,"output_tokens":1.2},"tags":[]}},{"id":"ClarityAI/crystal","object":"model","created":0,"owned_by":"deepinfra","root":"ClarityAI/crystal","parent":null,"metadata":null},{"id":"Bria/blur_background","object":"model","created":0,"owned_by":"deepinfra","root":"Bria/blur_background","parent":null,"metadata":null},{"id":"meta-llama/Meta-Llama-3-8B-Instruct","object":"model","created":0,"owned_by":"deepinfra","root":"meta-llama/Meta-Llama-3-8B-Instruct","parent":null,"metadata":{"description":"Meta developed and released the Meta Llama 3 family of large language models (LLMs), a collection of pretrained and instruction tuned generative text models in 8 and 70B sizes.","context_length":8192,"max_tokens":8192,"pricing":{"input_tokens":0.030000000000000002,"output_tokens":0.04},"tags":[]}},{"id":"ByteDance/Seed-2.0-pro","object":"model","created":0,"owned_by":"deepinfra","root":"ByteDance/Seed-2.0-pro","parent":null,"metadata":{"description":"Built for the Agent era, it delivers stable performance in complex reasoning and long-horizon tasks, including multi-step planning, visual-text reasoning, video understanding, and advanced 
analysis.","context_length":256000,"max_tokens":256000,"pricing":{"input_tokens":0.5,"output_tokens":2.9999999999999996,"cache_read_tokens":0.1},"tags":["vision","prompt_cache","reasoning"]}},{"id":"Qwen/Qwen-Image-Max","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen-Image-Max","parent":null,"metadata":null},{"id":"Wan-AI/Wan2.6-Image-Edit","object":"model","created":0,"owned_by":"deepinfra","root":"Wan-AI/Wan2.6-Image-Edit","parent":null,"metadata":null},{"id":"Qwen/Qwen3.5-9B","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3.5-9B","parent":null,"metadata":{"description":"Qwen3.5-9B is a high-performance model from Alibaba's Qwen3.5 series with a hybrid Gated Delta Networks and sparse MoE architecture. It features a 262K token context window, thinking/reasoning mode, tool calling, multi-token prediction, and support for 201 languages. Excels at reasoning, coding, instruction following, and long-context tasks.","context_length":262144,"max_tokens":262144,"pricing":{"input_tokens":0.04,"output_tokens":0.2},"tags":["vision","reasoning_effort"]}},{"id":"Bria/erase_foreground","object":"model","created":0,"owned_by":"deepinfra","root":"Bria/erase_foreground","parent":null,"metadata":null},{"id":"ByteDance/Seed-2.0-mini","object":"model","created":0,"owned_by":"deepinfra","root":"ByteDance/Seed-2.0-mini","parent":null,"metadata":{"description":"Built for low-latency, high-concurrency, cost-sensitive use cases, with flexible deployment, four-tier thinking, and 
multimodal","context_length":256000,"max_tokens":256000,"pricing":{"input_tokens":0.1,"output_tokens":0.4,"cache_read_tokens":0.020000000000000004},"tags":["vision","prompt_cache","reasoning"]}},{"id":"sentence-transformers/clip-ViT-B-32","object":"model","created":0,"owned_by":"deepinfra","root":"sentence-transformers/clip-ViT-B-32","parent":null,"metadata":null},{"id":"Bria/gen_fill","object":"model","created":0,"owned_by":"deepinfra","root":"Bria/gen_fill","parent":null,"metadata":null},{"id":"Qwen/Qwen3.5-2B","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3.5-2B","parent":null,"metadata":{"description":"Qwen3.5-2B is a compact yet capable model from Alibaba's Qwen3.5 series. It features a 262K token context window, support for 201 languages, thinking/reasoning mode, and tool calling for agentic workflows. A strong choice for prototyping, fine-tuning, and efficient multilingual deployments.","context_length":262144,"max_tokens":262144,"pricing":{"input_tokens":0.02,"output_tokens":0.1},"tags":["vision","reasoning_effort"]}},{"id":"meta-llama/Meta-Llama-3.1-8B-Instruct","object":"model","created":0,"owned_by":"deepinfra","root":"meta-llama/Meta-Llama-3.1-8B-Instruct","parent":null,"metadata":{"description":"Meta developed and released the Meta Llama 3.1 family of large language models (LLMs), a collection of pretrained and instruction tuned generative text models in 8B, 70B and 405B sizes","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.02,"output_tokens":0.05},"tags":[]}},{"id":"zai-org/GLM-4.6V","object":"model","created":0,"owned_by":"deepinfra","root":"zai-org/GLM-4.6V","parent":null,"metadata":{"description":"This model is part of the GLM-V family of models, introduced in the paper GLM-4.1V-Thinking and GLM-4.5V: Towards Versatile Multimodal Reasoning with Scalable Reinforcement 
Learning.","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.3,"output_tokens":0.9},"tags":["vision","reasoning"]}},{"id":"intfloat/e5-large-v2","object":"model","created":0,"owned_by":"deepinfra","root":"intfloat/e5-large-v2","parent":null,"metadata":null},{"id":"PrunaAI/p-image","object":"model","created":0,"owned_by":"deepinfra","root":"PrunaAI/p-image","parent":null,"metadata":null},{"id":"Bria/expand","object":"model","created":0,"owned_by":"deepinfra","root":"Bria/expand","parent":null,"metadata":null},{"id":"PaddlePaddle/PaddleOCR-VL-0.9B","object":"model","created":0,"owned_by":"deepinfra","root":"PaddlePaddle/PaddleOCR-VL-0.9B","parent":null,"metadata":{"description":"PaddleOCR-VL is a SOTA and resource-efficient model tailored for document parsing. Its core component is PaddleOCR-VL-0.9B, a compact yet powerful vision-language model (VLM) that integrates a NaViT-style dynamic resolution visual encoder with the ERNIE-4.5-0.3B language model to enable accurate element recognition. This innovative model efficiently supports 109 languages and excels in recognizing complex elements (e.g., text, tables, formulas, and charts), while maintaining minimal resource consumption. Through comprehensive evaluations on widely used public benchmarks and in-house benchmarks, PaddleOCR-VL achieves SOTA performance in both page-level document parsing and element-level recognition. It significantly outperforms existing solutions, exhibits strong competitiveness against top-tier VLMs, and delivers fast inference speeds. 
These strengths make it highly suitable for practical deployment in real-world scenarios.","context_length":16384,"max_tokens":16384,"pricing":{"input_tokens":0.14,"output_tokens":0.8},"tags":["vision"]}},{"id":"openai/gpt-oss-120b-Turbo","object":"model","created":0,"owned_by":"deepinfra","root":"openai/gpt-oss-120b-Turbo","parent":null,"metadata":{"description":"","context_length":131072,"max_tokens":131072,"pricing":{"input_tokens":0.15,"output_tokens":0.6},"tags":["reasoning_effort","reasoning"]}},{"id":"deepseek-ai/DeepSeek-V3.2","object":"model","created":0,"owned_by":"deepinfra","root":"deepseek-ai/DeepSeek-V3.2","parent":null,"metadata":{"description":"DeepSeek-V3.2 is a large language model designed to harmonize high computational efficiency with strong reasoning and agentic tool-use performance. It introduces DeepSeek Sparse Attention (DSA), a fine-grained sparse attention mechanism that reduces training and inference cost while preserving quality in long-context scenarios. A scalable reinforcement learning post-training framework further improves reasoning, with reported performance in the GPT-5 class, and the model has demonstrated gold-medal results on the 2025 IMO and IOI. V3.2 also uses a large-scale agentic task synthesis pipeline to better integrate reasoning into tool-use settings, boosting compliance and generalization in interactive environments.","context_length":163840,"max_tokens":163840,"pricing":{"input_tokens":0.26,"output_tokens":0.38,"cache_read_tokens":0.13},"tags":["prompt_cache"]}},{"id":"Qwen/Qwen3.5-0.8B","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3.5-0.8B","parent":null,"metadata":{"description":"Qwen3.5-0.8B is Alibaba's smallest model in the Qwen3.5 series, featuring a hybrid Gated Delta Networks and sparse Mixture-of-Experts architecture. Despite its compact size, it supports a 262K token context window, 201 languages, thinking/reasoning mode, and tool calling. 
Ideal for edge deployments, resource-constrained environments, and lightweight inference tasks.","context_length":262144,"max_tokens":262144,"pricing":{"input_tokens":0.01,"output_tokens":0.05},"tags":["vision","reasoning_effort"]}},{"id":"mistralai/Mixtral-8x7B-Instruct-v0.1","object":"model","created":0,"owned_by":"deepinfra","root":"mistralai/Mixtral-8x7B-Instruct-v0.1","parent":null,"metadata":{"description":"Mixtral is a mixture of experts large language model (LLM) from Mistral AI. This is a state of the art machine learning model using a mixture of 8 experts (MoE) 7b models. During inference 2 experts are selected. This architecture allows large models to be fast and cheap at inference. The Mixtral-8x7B outperforms Llama 2 70B on most benchmarks.","context_length":32768,"max_tokens":32768,"pricing":{"input_tokens":0.54,"output_tokens":0.54},"tags":[]}},{"id":"Qwen/Qwen3.5-4B","object":"model","created":0,"owned_by":"deepinfra","root":"Qwen/Qwen3.5-4B","parent":null,"metadata":{"description":"Qwen3.5-4B is a mid-size model from Alibaba's Qwen3.5 series that delivers a strong balance of performance and efficiency. It features a 262K token context window (extensible to 1M with YaRN), thinking/reasoning mode, tool calling, and support for 201 languages. Well-suited for complex reasoning, code generation, and agentic applications.","context_length":262144,"max_tokens":262144,"pricing":{"input_tokens":0.030000000000000002,"output_tokens":0.15},"tags":["vision","reasoning_effort"]}}]}