[{"model_name":"openai/whisper-large-v3","type":"automatic-speech-recognition","reported_type":"automatic-speech-recognition","description":"Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification.","cover_img_url":"https://shared.deepinfra.com/models/openai/whisper-large-v3/cover_image.fe8f2de15e8bd36b600918e0f1eae37afe05b3e7d274d085553f5f5fab1bebbc.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_length","cents_per_input_sec":0.00075},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2024-07-17T21:49:19+00:00","private":0,"is_partner":false},{"model_name":"black-forest-labs/FLUX-1-Redux-dev","type":"text-to-image","reported_type":"text-to-image","description":"FLUX.1 Redux [dev] is an image variation generation adapter for all FLUX.1 base models. It enables users to refine images with slight variations and supports text-based restyling via API. Integrated with FLUX1.1 [pro] Ultra, it allows for high-quality 4-megapixel outputs. The model can be used with Diffusers in Python for efficient image generation. 
While powerful, it has ethical and factual limitations and is governed by a non-commercial license.","cover_img_url":"https://shared.deepinfra.com/models/black-forest-labs/FLUX-1-Redux-dev/cover_image.b29f2ca852db3b8be1c5a865a3959c95f6982e09e8c53b6f55ad587ba7d855eb.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":1.2,"default_width":1024,"default_height":1024,"default_iterations":25,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-02-24T15:29:11+00:00","private":0,"is_partner":false},{"model_name":"Bria/video_mask_by_prompt","type":"text-to-video","reported_type":"text-to-video","description":"Identify and segment objects across video frames using a text prompt. The easiest way to create a mask to modify your videos.","cover_img_url":"https://shared.deepinfra.com/models/Bria/video_mask_by_prompt/cover_image.1b27bd48f347277f90a1e064a2dba09346aa6c3fef056765a0e2adbb4cf160cd.webp","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"output_length","cents_per_output_sec":14.0},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-01-28T18:11:25+00:00","private":0,"is_partner":true},{"model_name":"meta-llama/Llama-3.2-1B-Instruct","type":"text-generation","reported_type":"text-generation","description":"The Meta Llama 3.2 collection of multilingual large language models (LLMs) is a collection of pretrained and instruction-tuned generative models in 1B and 3B sizes (text in/text 
out).","cover_img_url":"https://shared.deepinfra.com/models/meta-llama/Llama-3.2-1B-Instruct/cover_image.263b22ba8551018d6792180359cadf259a4bfb41b8db163f738a304a6186d8be.webp","tags":["openai","tools"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":5e-07,"cents_per_output_token":1e-06,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":"meta-llama/Llama-3.2-11B-Vision-Instruct","deprecated":1773101029,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-09-26T23:16:05+00:00","private":0,"is_partner":false},{"model_name":"Pixverse/Pixverse-T2V-HD","type":"text-to-video","reported_type":"text-to-video","description":"The 1080p high-fidelity mode in PixVerse renders videos with significantly enhanced sharpness and visual clarity, capturing intricate details and providing a crisp, professional-grade quality suitable for more polished projects.","cover_img_url":"https://shared.deepinfra.com/models/Pixverse/Pixverse-T2V-HD/cover_image.9111220927c06119f7341b36ddfe4355261f72ff2e8fe62c9b448c8ce262cf64.webp","tags":["no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":40.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-10-23T15:43:31+00:00","private":0,"is_partner":true},{"model_name":"bosonai/HiggsAudioV2.5","type":"text-to-speech","reported_type":"text-to-speech","description":"HiggsAudioV2.5 is a high-quality neural text-to-speech (TTS) model designed for natural-sounding voice generation across a wide range of use cases. 
It focuses on clarity, stable prosody, and consistent pacing, making it suitable for both short prompts and longer narration.","cover_img_url":"https://shared.deepinfra.com/models/bosonai/HiggsAudioV2.5/cover_image.215a171eb30340e08eb631878a1b4a8657d2a9c32822d7cf488e598e5b4a8dd5.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_character_length","cents_per_input_chars":0.002},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-01-16T23:56:30+00:00","private":0,"is_partner":false},{"model_name":"mistralai/Mixtral-8x22B-Instruct-v0.1","type":"text-generation","reported_type":"text-generation","description":"This is the instruction fine-tuned version of Mixtral-8x22B - the latest and largest mixture of experts large language model (LLM) from Mistral AI. This state of the art machine learning model uses a mixture 8 of experts (MoE) 22b models. During inference 2 experts are selected. This architecture allows large models to be fast and cheap at inference.","cover_img_url":"https://shared.deepinfra.com/models/mistralai/Mixtral-8x22B-Instruct-v0.1/cover_image.8bb1b015367a1537fd23c69d5b8117675a86b207c9bd3cce326b750ef877bcb6.webp","tags":["openai","tools"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":6.5e-05,"cents_per_output_token":6.5e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":65536,"replaced_by":"deepseek-ai/DeepSeek-V3.2","deprecated":1771449048,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-04-17T19:07:17+00:00","private":0,"is_partner":false},{"model_name":"anthropic/claude-sonnet-4-6","type":"text-generation","reported_type":"text-generation","description":"Claude Sonnet 4.6 delivers frontier intelligence at scale—built for coding, agents, and enterprise 
workflows.","cover_img_url":"https://shared.deepinfra.com/models/anthropic/claude-sonnet-4-6/cover_image.eb8fe98327537b626c115e8e17b5670154e60083b82bcf56821263f376bb8490.webp","tags":["multimodal","cc-native","reasoning","json","structured-output","tools","ocr","no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":0.0003,"cents_per_output_token":0.0015,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":1000000,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-05-13T16:57:59+00:00","private":0,"is_partner":true},{"model_name":"inworld-ai/realtime-tts-1.5-mini","type":"text-to-speech","reported_type":"text-to-speech","description":"Fast multilingual text-to-speech model by Inworld AI with 130+ preset voices across 15 languages. Supports voice cloning, word-level timestamps, and streaming. Optimized for low-latency applications with <130ms time-to-first-audio.","cover_img_url":"https://shared.deepinfra.com/models/inworld-ai/realtime-tts-1.5-mini/cover_image.bc7fa4a01efac7cb31b6e68ea482f3f3f18ae2d46b12bf57b872acc05e5dc115.webp","tags":["voice","no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_character_length","cents_per_input_chars":0.0025},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-05-04T22:17:23+00:00","private":0,"is_partner":true},{"model_name":"black-forest-labs/FLUX-2-max","type":"text-to-image","reported_type":"text-to-image","description":"The new top-tier image model from Black Forest Labs, significantly pushing image quality and editing 
consistency","cover_img_url":"https://shared.deepinfra.com/models/black-forest-labs/FLUX-2-max/cover_image.e62a755696c193f02011a7395b2c0cfcb2727967897a922046f819cdc458c94d.webp","tags":["no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":10.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":7.0,"usage_from_cost":true},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-12-16T12:11:12+00:00","private":0,"is_partner":true},{"model_name":"zai-org/GLM-4.6","type":"text-generation","reported_type":"text-generation","description":"Compared with GLM-4.5, GLM-4.6 brings several key improvements:  Longer context window: The context window has been expanded from 128K to 200K tokens, enabling the model to handle more complex agentic tasks. Superior coding performance: The model achieves higher scores on code benchmarks and demonstrates better real-world performance in applications such as Claude Code、Cline、Roo Code and Kilo Code, including improvements in generating visually polished front-end pages. Advanced reasoning: GLM-4.6 shows a clear improvement in reasoning performance and supports tool use during inference, leading to stronger overall capability. More capable agents: GLM-4.6 exhibits stronger performance in tool using and search-based agents, and integrates more effectively within agent frameworks. 
Refined writing: Better aligns with human preferences in style and readability, and performs more naturally in role-playing scenarios.","cover_img_url":"https://shared.deepinfra.com/models/zai-org/GLM-4.6/cover_image.34ee1ba13d69b6c0344976bedf9742050590e550b4452f33a9c6da5d7a4e70db.webp","tags":["openai","tools","reasoning","can-disable-reasoning","structured-output","json"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":4.3e-05,"cents_per_output_token":0.000174,"rate_per_input_token_cached":0.18604651,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":202752,"replaced_by":null,"deprecated":null,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2025-09-30T20:44:07+00:00","private":0,"is_partner":false},{"model_name":"meta-llama/Llama-3.2-11B-Vision-Instruct","type":"text-generation","reported_type":"text-generation","description":"Llama 3.2 11B Vision is a multimodal model with 11 billion parameters, designed to handle tasks combining visual and textual data. It excels in tasks such as image captioning and visual question answering, bridging the gap between language generation and visual reasoning. Pre-trained on a massive dataset of image-text pairs, it performs well in complex, high-accuracy image analysis.  
Its ability to integrate visual understanding with language processing makes it an ideal solution for industries requiring comprehensive visual-linguistic AI applications, such as content creation, AI-driven customer service, and research.","cover_img_url":"https://shared.deepinfra.com/models/meta-llama/Llama-3.2-11B-Vision-Instruct/cover_image.cb5529677d0ddf8aef17a4f2f10390cf54a0ceb934f0c0d88c12f7b1bcef0704.webp","tags":["ocr","openai","multimodal","json","non-reasoning","structured-output","cc-native","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3.45e-05,"cents_per_output_token":3.45e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":131072,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2024-09-26T21:46:21+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen-Image-Edit-Max","type":"text-to-image","reported_type":"text-to-image","description":"Enhanced industrial design and geometric reasoning, improved character consistency, reduced offset issues, and integrated LoRA capabilities","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen-Image-Edit-Max/cover_image.38213b95109e9548ac46b7a3b658a4b6c5952b6c8253aab9508a334467e123ea.webp","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":7.5,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-02-18T12:44:07+00:00","private":0,"is_partner":true},{"model_name":"Bria/Bria-3.2","type":"text-to-image","reported_type":"text-to-image","description":"Bria 3.2 is the next-generation commercial-ready text-to-image model. 
With just 4 billion parameters, it provides exceptional aesthetics and text rendering, evaluated to be on par to leading open-source models, and outperforming other licensed models.","cover_img_url":"https://shared.deepinfra.com/models/Bria/Bria-3.2/cover_image.5b0eba958098c095ec7d54ba85da3523838b525af4b9387d4683ffbe7cf05722.webp","tags":["no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":4.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-09-11T12:52:08+00:00","private":0,"is_partner":true},{"model_name":"lizpreciatior/lzlv_70b_fp16_hf","type":"text-generation","reported_type":"text-generation","description":"A Mythomax/MLewd_13B-style merge of selected 70B models  A multi-model merge of several  LLaMA2 70B finetunes for roleplaying and creative work. The goal was to create a model that combines creativity with intelligence for an enhanced experience.","cover_img_url":"https://shared.deepinfra.com/models/lizpreciatior/lzlv_70b_fp16_hf/cover_image.2bb893141f7dce176afce500c4ec8ca22cfe5e2b00253d997fea31a7f60adc1b.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3.5e-05,"cents_per_output_token":4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":4096,"replaced_by":"meta-llama/Llama-3.3-70B-Instruct-Turbo","deprecated":1762392581,"quantization":"fp16","mmlu":null,"expected":null,"create_ts":"2023-10-25T01:46:29+00:00","private":0,"is_partner":false},{"model_name":"XiaomiMiMo/MiMo-V2.5-tts-voiceclone","type":"text-to-speech","reported_type":"text-to-speech","description":"Automatically convert input text into natural and fluent speech output. 
You can generate natural and vivid speech content by configuring parameters such as speech style and voice. Precisely replicate voices from audio samples to enable speech synthesis of any voice.","cover_img_url":"https://shared.deepinfra.com/models/XiaomiMiMo/MiMo-V2.5-tts-voiceclone/cover_image.d52a6b6c7f10ab342edc1379f84d6e63d2567726abc2be966b8bc5c9a88935fe.webp","tags":["voice","no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"input_character_length","cents_per_input_chars":0.0},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-05-07T12:58:34+00:00","private":0,"is_partner":true},{"model_name":"nvidia/Cosmos3-Super","type":"text-to-video","reported_type":"world-model","description":"Cosmos3 is a world foundation model that unifies understanding and generation within a single Mixture-of-Transformer (MoT) architecture. Two tightly coupled towers—a Reasoner (vision-language model) and a Generator (world simulator)—share latent representations so that structured perception directly grounds realistic, temporally consistent simulation.","cover_img_url":"https://shared.deepinfra.com/models/nvidia/Cosmos3-Super/cover_image.db00284b101ef095cfb4ebdc677fae6aa57b6a771354c19100eedb90907069a1.webp","tags":["openai"],"pricing":{"short":"$0.0432 / second (480p)","full":null,"table":null,"type":"frame_units","cents_per_frame_unit":0.18},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-06-03T16:39:02+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen2.5-7B-Instruct","type":"text-generation","reported_type":"text-generation","description":"The 7 billion parameter Qwen2.5 excels in language understanding, multilingual capabilities, coding, mathematics, and 
reasoning","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen2.5-7B-Instruct/cover_image.f1f5fd736a4e6bd9c33961de8cc74acf677c130545e1beece8cd0b06c3c69fea.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":4e-06,"cents_per_output_token":1e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":32768,"replaced_by":"Qwen/Qwen3-14B","deprecated":1762392581,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-11-12T23:21:23+00:00","private":0,"is_partner":false},{"model_name":"runwayml/stable-diffusion-v1-5","type":"text-to-image","reported_type":"text-to-image","description":"Most widely used version of Stable Diffusion. Trained on 512x512 images, it can generate realistic images given text description","cover_img_url":"https://shared.deepinfra.com/models/runwayml/stable-diffusion-v1-5/cover_image.8f31bd46420a8a6678deb05ab40e7ca5400f5d0620dda18ad1acc938727b5d45.jpg","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"time","cents_per_sec":0.05},"max_tokens":null,"replaced_by":"stabilityai/sdxl-turbo","deprecated":1727456682,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-03-17T02:03:21+00:00","private":0,"is_partner":false},{"model_name":"allenai/olmOCR-7B-0825","type":"text-generation","reported_type":"text-generation","description":"olmOCR is a specialized AI tool that converts PDF documents into clean, structured text while preserving important formatting and layout information. What makes olmOCR particularly valuable for developers is its ability to handle challenging PDFs that traditional OCR tools struggle with—including complex layouts, poor-quality scans, handwritten text, and documents with mixed content types. 
Built on a fine-tuned 7B vision-language model, olmOCR provides enterprise-grade PDF processing at a fraction of the cost of proprietary solutions.","cover_img_url":"https://shared.deepinfra.com/models/allenai/olmOCR-7B-0825/cover_image.345ea3325a1ac51b1fb2b456b76257452b87ebbda2612a4807a9fcc97a87a5a2.webp","tags":["openai","multimodal","cc-native","json","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.4e-05,"cents_per_output_token":8e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":16384,"replaced_by":"google/gemma-4-31B-it","deprecated":1778120282,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-09-26T22:38:31+00:00","private":0,"is_partner":false},{"model_name":"moonshotai/Kimi-K2.7-Code","type":"text-generation","reported_type":"text-generation","description":"Kimi K2.7 Code is a coding-focused agentic model built upon Kimi K2.6. 
With substantial improvements on real-world long-horizon coding tasks, it strengthens end-to-end task completion across complex software engineering workflows while improving token efficiency, reducing thinking-token usage by approximately 30% compared with Kimi K2.6.","cover_img_url":"https://shared.deepinfra.com/models/moonshotai/Kimi-K2.7-Code/cover_image.af1b9be23582945940e6c5c2d202754d1c4f187fd5d72ac06c0138fe52f95288.webp","tags":["openai","can-disable-reasoning","multimodal","json","tools","structured-output","reasoning","priority","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":7.4e-05,"cents_per_output_token":0.00035,"rate_per_input_token_cached":0.2027027,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2026-06-15T23:40:40+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-TTS","type":"text-to-speech","reported_type":"text-to-speech","description":"  Qwen3-TTS is an advanced text-to-speech model by Alibaba's Qwen team, delivering stable, expressive, and low-latency speech generation across 10 languages.                                                                                                                                                                                                                                                                                                                                           
Key capabilities:                                                                                                                                                                                                                                  - 9 preset voices — Vivian, Serena, Uncle_Fu, Dylan, Eric, Ryan, Aiden, Ono_Anna, Sohee — covering diverse genders, ages, and accents                                                                                                              - Voice cloning — clone any voice from a short (~3s) audio sample via the voice_id parameter   - Instruction control — adjust tone, emotion, and speaking style with natural language (e.g. \"speak slowly and calmly\", \"excited tone\")   - 10 languages — English, Chinese, Japanese, Korean, German, French, Russian, Spanish, Italian, Portuguese   - Streaming support — real-time PCM streaming with ~97ms first-byte latency   - Multiple output formats — WAV, MP3, FLAC, PCM    Built on a 1.7B parameter architecture using discrete multi-codebook language modeling for end-to-end speech synthesis without cascading errors. Uses a custom 12Hz acoustic tokenizer that preserves paralinguistic information and   environmental audio details.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-TTS/cover_image.b34adcc17f3762066c93d008682485f5b083ab7712d54603fdaeab9bac19f5ed.webp","tags":["voice","openai","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"input_character_length","cents_per_input_chars":0.002},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-03-06T10:10:47+00:00","private":0,"is_partner":false},{"model_name":"XiaomiMiMo/MiMo-V2.5-tts-voicedesign","type":"text-to-speech","reported_type":"text-to-speech","description":"Automatically convert input text into natural and fluent speech output. 
You can generate natural and vivid speech content by configuring parameters such as speech style and voice. Automatically generate voices from text descriptions, without requiring presets or audio samples.","cover_img_url":"https://shared.deepinfra.com/models/XiaomiMiMo/MiMo-V2.5-tts-voicedesign/cover_image.b0a4e13b20b798a7471d8a51629b84d14827adb763fc5179c6705e90a12a89a5.webp","tags":["voice","no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_character_length","cents_per_input_chars":0.0},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-05-07T12:52:51+00:00","private":0,"is_partner":true},{"model_name":"Bria/erase_foreground","type":"text-to-image","reported_type":"text-to-image","description":"Bria Erase Foreground precisely removes main subjects or foreground objects from images. Built entirely on licensed data, it is safe and optimized for professional and commercial use.","cover_img_url":"https://shared.deepinfra.com/models/Bria/erase_foreground/cover_image.198b1008f4b3e382ee5dcf8c540c2a77943cdfba039e3ffa20b492812363936f.webp","tags":["no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":4.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-09-01T13:41:46+00:00","private":0,"is_partner":true},{"model_name":"mistralai/Mixtral-8x7B-Instruct-v0.1","type":"text-generation","reported_type":"text-generation","description":"Mixtral is a mixture of experts large language model (LLM) from Mistral AI. This is a state of the art machine learning model using a mixture of 8 experts (MoE) 7b models. During inference 2 experts are selected. This architecture allows large models to be fast and cheap at inference. 
The Mixtral-8x7B outperforms Llama 2 70B on most benchmarks.","cover_img_url":"https://shared.deepinfra.com/models/mistralai/Mixtral-8x7B-Instruct-v0.1/cover_image.a3146cc88bb3c77e6eae14b35d8db03d7952a597633a53378ef8182186c5a9d7.webp","tags":["openai","tools","json","non-reasoning","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":5.4e-05,"cents_per_output_token":5.4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":32768,"replaced_by":"mistralai/Mistral-Small-24B-Instruct-2501","deprecated":1778120390,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2023-12-12T04:05:36+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-Embedding-4B-batch","type":"embeddings","reported_type":"embeddings","description":"The Qwen3 Embedding model series is the latest proprietary model of the Qwen family, specifically designed for text embedding and ranking tasks. 
Building upon the dense foundational models of the Qwen3 series, it provides a comprehensive range of text embeddings and reranking models in various sizes (0.6B, 4B, and 8B).","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-Embedding-4B-batch/cover_image.0d97a5a6c8c888a5d0de7fddbe265531552c2c852759bdde17ecede2618d4a25.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":1e-06},"max_tokens":32768,"replaced_by":"Qwen/Qwen3-Embedding-4B","deprecated":1783352675,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-10-30T12:26:01+00:00","private":0,"is_partner":false},{"model_name":"Wan-AI/Wan2.6-T2I","type":"text-to-image","reported_type":"text-to-image","description":"Wan2.6 text to image, Upgraded visual quality, aesthetics, and instruction-following deliver precise style control, realistic portraits, long-text understanding, and broad historical/cultural IP coverage, enabling high-quality, highly expressive visual generation.","cover_img_url":"https://shared.deepinfra.com/models/Wan-AI/Wan2.6-T2I/cover_image.fb2f90e0277caff55bb415a29f67b4e0f38613f8580698792eefdd42da24ad76.webp","tags":["no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":3.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-02-18T12:41:39+00:00","private":0,"is_partner":true},{"model_name":"nvidia/NVIDIA-Nemotron-Nano-9B-v2","type":"text-generation","reported_type":"text-generation","description":"NVIDIA-Nemotron-Nano-9B-v2 is a large language model (LLM) trained from scratch by NVIDIA, and designed as a unified model for both reasoning and non-reasoning tasks. 
It responds to user queries and tasks by first generating a reasoning trace and then concluding with a final response.  The model's reasoning capabilities can be controlled via a system prompt. If the user prefers the model to provide its final answer without intermediate reasoning traces, it can be configured to do so.","cover_img_url":"https://shared.deepinfra.com/models/nvidia/NVIDIA-Nemotron-Nano-9B-v2/cover_image.b826f527a1598a41a5fe4d0f1dd18528b7ee57da8c13c7c67e4a610f71d8759b.webp","tags":["openai","tools","reasoning","json","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":4e-06,"cents_per_output_token":1.6e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":"nvidia/Nemotron-3-Nano-30B-A3B","deprecated":1781217522,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2025-09-09T17:16:35+00:00","private":0,"is_partner":false},{"model_name":"meta-llama/Meta-Llama-3-8B-Instruct","type":"text-generation","reported_type":"text-generation","description":"Meta developed and released the Meta Llama 3 family of large language models (LLMs), a collection of pretrained and instruction tuned generative text models in 8 and 70B 
sizes.","cover_img_url":"https://shared.deepinfra.com/models/meta-llama/Meta-Llama-3-8B-Instruct/cover_image.9ea753fd36aabfbca4939ee488b859e08e95c4626ffff71ec3a385be66b1d3ba.webp","tags":["openai","tools","json","non-reasoning","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3e-06,"cents_per_output_token":4e-06,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":8192,"replaced_by":"meta-llama/Llama-4-Scout-17B-16E-Instruct","deprecated":1778120368,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-04-18T20:39:52+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3.5-9B","type":"text-generation","reported_type":"text-generation","description":"Qwen3.5-9B is a high-performance model from Alibaba's Qwen3.5 series with a hybrid Gated Delta Networks and sparse MoE architecture. It features a 262K token context window, thinking/reasoning mode, tool calling, multi-token prediction, and support for 201 languages. 
Excels at reasoning, coding, instruction following, and long-context tasks.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3.5-9B/cover_image.17a987fee6bc07013ff4a507d95dfef21f5dc2d627169d1f3952eead34719a7c.webp","tags":["openai","tools","multimodal","cc-native","json","structured-output","input-video","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1e-05,"cents_per_output_token":1.5e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2026-03-24T00:40:29+00:00","private":0,"is_partner":false},{"model_name":"meta-llama/Meta-Llama-3.1-70B-Instruct","type":"text-generation","reported_type":"text-generation","description":"Meta developed and released the Meta Llama 3.1 family of large language models (LLMs), a collection of pretrained and instruction tuned generative text models in 8B, 70B and 405B sizes","cover_img_url":"https://shared.deepinfra.com/models/meta-llama/Meta-Llama-3.1-70B-Instruct/cover_image.9a5d4e05f46ed0cf09ef22a6771aa701bd3ffaadfcd68fa7ca272bc8c1c7b8e6.webp","tags":["openai","tools","no-free-anon","json","non-reasoning","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":4e-05,"cents_per_output_token":4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":"meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo","deprecated":1781217521,"quantization":"bfloat16","mmlu":65.08,"expected":null,"create_ts":"2024-07-23T17:55:51+00:00","private":0,"is_partner":false},{"model_name":"Bria/fibo_edit","type":"text-to-image","reported_type":"text-to-image","description":"🥳 For a limited time, Fibo Edit is free on DeepInfra 🥳  YOUR AI, YOUR RULES. 
Visual Generation for Production-Grade. FIBO Edit. An open-source image editing model with native masking and a lightweight 8B architecture.","cover_img_url":"https://shared.deepinfra.com/models/Bria/fibo_edit/cover_image.968982cc73b73c693372dbe6d4838fab3870d4a51b3e35bec87b593ec8edac4c.webp","tags":["no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":4.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-01-28T17:46:14+00:00","private":0,"is_partner":true},{"model_name":"mistralai/Voxtral-Small-24B-2507","type":"automatic-speech-recognition","reported_type":"automatic-speech-recognition","description":"Voxtral Small is an enhancement of Mistral Small 3, incorporating state-of-the-art audio input capabilities while retaining best-in-class text performance. It excels at speech transcription, translation and audio understanding.","cover_img_url":"https://shared.deepinfra.com/models/mistralai/Voxtral-Small-24B-2507/cover_image.1f3f8f4b5161abfe0b4e5921c3e5716c43524d6a4b6ba4defe1c7ce0dd7041b4.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_length","cents_per_input_sec":0.005},"max_tokens":32768,"replaced_by":null,"deprecated":null,"quantization":"bf16","mmlu":null,"expected":null,"create_ts":"2025-07-19T02:06:21+00:00","private":0,"is_partner":false},{"model_name":"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8","type":"text-generation","reported_type":"text-generation","description":"The Llama 4 collection of models are natively multimodal AI models that enable text and multimodal experiences. These models leverage a mixture-of-experts architecture to offer industry-leading performance in text and image understanding. 
Llama 4 Maverick, a 17 billion parameter model with 128 experts","cover_img_url":"https://shared.deepinfra.com/models/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8/cover_image.430267c894ab16564dce2f2e8826fbc6e15740e9defaf47c52d683a2eab9a91b.webp","tags":["ocr","openai","multimodal","b200","json","non-reasoning","structured-output","cc-native","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.5e-05,"cents_per_output_token":6e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":1048576,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-04-05T23:12:29+00:00","private":0,"is_partner":false},{"model_name":"MiniMaxAI/MiniMax-M2","type":"text-generation","reported_type":"text-generation","description":"MiniMax-M2 is a Mini model built for Max coding & agentic workflows with just 10 billion activated parameters","cover_img_url":"https://shared.deepinfra.com/models/MiniMaxAI/MiniMax-M2/cover_image.5474fbcd5f4eaf2d5123e79bb02291d98597f9b2847a1e2bce182ea836d4d9d5.webp","tags":["openai","cc-native","tools","json"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2.54e-05,"cents_per_output_token":0.000102,"rate_per_input_token_cached":0.5,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":262144,"replaced_by":"MiniMaxAI/MiniMax-M2.5","deprecated":1776285489,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-11-11T00:06:55+00:00","private":0,"is_partner":false},{"model_name":"openai/whisper-tiny.en","type":"automatic-speech-recognition","reported_type":"automatic-speech-recognition","description":"Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation, trained on 680k hours of labeled data without fine-tuning. 
It's a Transformer based encoder-decoder model, trained on English-only or multilingual data, predicting transcriptions in the same or different language as the audio. Whisper checkpoints come in five configurations of varying model sizes.","cover_img_url":"","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"time","cents_per_sec":0.05},"max_tokens":null,"replaced_by":"openai/whisper-large-v3","deprecated":1722034221,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-02-15T22:55:04+00:00","private":0,"is_partner":false},{"model_name":"nvidia/NVIDIA-Nemotron-3-Super-120B-A12B","type":"text-generation","reported_type":"text-generation","description":"NVIDIA Nemotron 3 Super is a hybrid Mixture-of-Experts (MoE) model engineered for highest compute efficiency and accuracy in multi-agent applications and specialized agentic systems. It is optimized to run many collaborating agents per application on a single GPU, delivering high accuracy for reasoning, tool use, and instruction following.","cover_img_url":"https://shared.deepinfra.com/models/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B/cover_image.7a0c5754d70a07134fc3e1621dedcfe066a187e250c16f31a7f5f4691fd71dfe.webp","tags":["openai","tools","reasoning","json","structured-output","priority","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":8.5e-06,"cents_per_output_token":4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2026-03-10T23:19:44+00:00","private":0,"is_partner":false},{"model_name":"intfloat/multilingual-e5-large-instruct","type":"embeddings","reported_type":"embeddings","description":"The Multilingual-E5 models, initialized from XLM-RoBERTa, support up to 512 tokens per input — any longer text will be silently truncated. 
To ensure optimal performance, always prefix inputs with “query:” or “passage:”, as the model was explicitly trained with this format.","cover_img_url":"","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":1e-06},"max_tokens":512,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-04-17T06:27:01+00:00","private":0,"is_partner":false},{"model_name":"openai/gpt-oss-120b-Turbo","type":"text-generation","reported_type":"text-generation","description":"","cover_img_url":"","tags":["openai","tools","reasoning","json","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.5e-05,"cents_per_output_token":6e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":null,"deprecated":null,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2025-09-29T23:15:23+00:00","private":0,"is_partner":false},{"model_name":"Sao10K/L3-8B-Lunaris-v1-Turbo","type":"text-generation","reported_type":"text-generation","description":"","cover_img_url":"","tags":["json","openai","non-reasoning","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":4e-06,"cents_per_output_token":5e-06,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":8192,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-04-09T23:15:17+00:00","private":0,"is_partner":false},{"model_name":"moonshotai/Kimi-K2-Instruct","type":"text-generation","reported_type":"text-generation","description":"Kimi K2 is a large-scale Mixture-of-Experts (MoE) language model developed by Moonshot AI, featuring 1 trillion total parameters with 32 billion active per forward pass. 
It is optimized for agentic capabilities, including advanced tool use, reasoning, and code synthesis. Kimi K2 excels across a broad range of benchmarks, particularly in coding (LiveCodeBench, SWE-bench), reasoning (ZebraLogic, GPQA), and tool-use (Tau2, AceBench) tasks.","cover_img_url":"https://shared.deepinfra.com/models/moonshotai/Kimi-K2-Instruct/cover_image.230f1c8c59fda9e70c7a1164c430fdcea4163b21e42923f2c1a58c33284b9e61.webp","tags":["openai","tools","b200","json"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":5e-05,"cents_per_output_token":0.0002,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":"moonshotai/Kimi-K2.5","deprecated":1776286096,"quantization":"mixed: fp8/fp4","mmlu":null,"expected":null,"create_ts":"2025-07-14T21:34:15+00:00","private":0,"is_partner":false},{"model_name":"XiaomiMiMo/MiMo-V2.5-tts","type":"text-to-speech","reported_type":"text-to-speech","description":"Automatically convert input text into natural and fluent speech output. You can generate natural and vivid speech content by configuring parameters such as speech style and voice. 
Use the high-quality voices from the built-in voices list.","cover_img_url":"https://shared.deepinfra.com/models/XiaomiMiMo/MiMo-V2.5-tts/cover_image.a53c20b378615983babf56fd19bd543e1548c7a075bd49b96071bb1d3664236e.webp","tags":["voice","no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_character_length","cents_per_input_chars":0.0},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-05-07T07:16:18+00:00","private":0,"is_partner":true},{"model_name":"Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo","type":"text-generation","reported_type":"text-generation","description":"Qwen3-Coder-480B-A35B-Instruct is the Qwen3's most agentic code model, featuring Significant Performance on Agentic Coding, Agentic Browser-Use and other foundational coding tasks, achieving results comparable to Claude Sonnet.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo/cover_image.eda4bf16dbe8f7cb7e0de26fe0e600b3ccc1b5bb33414421fa50d7fbcb227160.webp","tags":["openai","tools","json","non-reasoning","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3e-05,"cents_per_output_token":0.0001,"rate_per_input_token_cached":0.33333333,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2025-07-26T00:11:23+00:00","private":0,"is_partner":false},{"model_name":"Bria/video_increase_resolution","type":"text-to-video","reported_type":"text-to-video","description":"Increase video resolution up to 8K with advanced AI upscaling. 
Bring your videos to the big screen, ready for the screens of tomorrow.","cover_img_url":"https://shared.deepinfra.com/models/Bria/video_increase_resolution/cover_image.db6b42cced00675bb22b40fd3a9f6916aaf5725bfbf29029dcb159a8c9898a02.webp","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"output_length","cents_per_output_sec":14.0},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-01-28T18:10:13+00:00","private":0,"is_partner":true},{"model_name":"MiniMaxAI/MiniMax-M2.5","type":"text-generation","reported_type":"text-generation","description":"MiniMax M2.5 is SOTA in coding, agentic tool use and search, office work, and a range of other economically valuable tasks, boasting scores of 80.2% in SWE-Bench Verified, 51.3% in Multi-SWE-Bench, and 76.3% in BrowseComp (with context management).","cover_img_url":"https://shared.deepinfra.com/models/MiniMaxAI/MiniMax-M2.5/cover_image.edb407edc42e05fc4141db41ded751f85a804dd79db2ee9a6b8eaca69653f80b.webp","tags":["openai","tools","json","reasoning","priority","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.5e-05,"cents_per_output_token":0.000115,"rate_per_input_token_cached":0.2,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":196608,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2026-02-13T16:09:28+00:00","private":0,"is_partner":false},{"model_name":"deepseek-ai/DeepSeek-V3.1","type":"text-generation","reported_type":"text-generation","description":"DeepSeek-V3.1 is post-trained on the top of DeepSeek-V3.1-Base, which is built upon the original V3 base checkpoint through a two-phase long context extension approach, following the methodology outlined in the original DeepSeek-V3 report. 
We have expanded our dataset by collecting additional long documents and substantially extending both training phases. The 32K extension phase has been increased 10-fold to 630B tokens, while the 128K extension phase has been extended by 3.3x to 209B tokens. Additionally, DeepSeek-V3.1 is trained using the UE8M0 FP8 scale data format to ensure compatibility with microscaling data formats.","cover_img_url":"https://shared.deepinfra.com/models/deepseek-ai/DeepSeek-V3.1/cover_image.b1ca5b6d9ddec813a34b4134cc8cd9011a7d736c5363eef89d40ce4d7e33516a.webp","tags":["openai","tools","reasoning","can-disable-reasoning","json","non-reasoning","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2.1e-05,"cents_per_output_token":7.9e-05,"rate_per_input_token_cached":0.61904762,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":163840,"replaced_by":null,"deprecated":null,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2025-08-21T21:06:35+00:00","private":0,"is_partner":false},{"model_name":"deepseek-ai/DeepSeek-R1-Turbo","type":"text-generation","reported_type":"text-generation","description":"We introduce DeepSeek-R1, which incorporates cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1 across math, code, and reasoning tasks. 
","cover_img_url":"https://shared.deepinfra.com/models/deepseek-ai/DeepSeek-R1-Turbo/cover_image.ef6fed0befbf14b90f0be63a8a12af864029c2fe7178cdab32fc801f79223e48.webp","tags":["openai","tools","b200","json"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":0.0001,"cents_per_output_token":0.0003,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":40960,"replaced_by":"deepseek-ai/DeepSeek-R1-0528","deprecated":1765492249,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2025-02-26T08:49:20+00:00","private":0,"is_partner":false},{"model_name":"PrunaAI/p-image","type":"text-to-image","reported_type":"text-to-image","description":"P-Image is a state-of-the-art real-time generation model  with exceptional text rendering, fine-detail accuracy, and rock-solid prompt adherence. It’s built for instant creativity at high-fidelity images in about one second at a fraction of typical model costs.","cover_img_url":"https://shared.deepinfra.com/models/PrunaAI/p-image/cover_image.c31822482b2491a55a04f15056170a3f8bcea072e25bdd69e8bb1bc949fffe4b.webp","tags":["no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":0.5,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-12-01T12:31:08+00:00","private":0,"is_partner":true},{"model_name":"Qwen/Qwen3-Reranker-4B","type":"reranker","reported_type":"reranker","description":"The Qwen3 Embedding model series is the latest proprietary model of the Qwen family, specifically designed for text embedding and ranking tasks. 
Building upon the dense foundational models of the Qwen3 series, it provides a comprehensive range of text embeddings and reranking models in various sizes (0.6B, 4B, and 8B)","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-Reranker-4B/cover_image.64532474108cce61f32c9aa7f01fc1a46e27722ca71cf49a1f8e89403c8445df.webp","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":2.5e-06},"max_tokens":32768,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-07-02T10:16:01+00:00","private":0,"is_partner":false},{"model_name":"ByteDance/Seedance-1.5-Pro","type":"text-to-video","reported_type":"text-to-video","description":"ByteDance's Seedance 1.5 Pro is a professional video model using V2A native generation for integrated, synced audio-visual output, enhancing efficiency of professional video creation.","cover_img_url":"https://shared.deepinfra.com/models/ByteDance/Seedance-1.5-Pro/cover_image.3a04465c7b080e6a46070c9ea13aa88e223bb62c44ee8d822f157834af15805d.webp","tags":["no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":0.00012},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-02-16T14:41:01+00:00","private":0,"is_partner":true},{"model_name":"mattshumer/Reflection-Llama-3.1-70B","type":"text-generation","reported_type":"text-generation","description":"Reflection Llama-3.1 70B is trained with a new technique called Reflection-Tuning that teaches a LLM to detect mistakes in its reasoning and correct course.  
The model was trained on synthetic data.","cover_img_url":"https://shared.deepinfra.com/models/mattshumer/Reflection-Llama-3.1-70B/cover_image.4c02ec7b6e0967223e64318df2c2bc2d243156dc67ba770ebfced82ba718bca5.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3.5e-05,"cents_per_output_token":4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":8192,"replaced_by":"meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo","deprecated":1781217521,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-09-06T20:19:33+00:00","private":0,"is_partner":false},{"model_name":"PrunaAI/p-image-Edit","type":"text-to-image","reported_type":"text-to-image","description":"P-Image-Edit is a high-precision image editing model that applies complex transformations, insertions, removals, and style adjustments in under a second. It delivers state-of-the-art accuracy, clean boundaries, and reliable prompt alignment, making multi-step edits fast, consistent, and production-ready.","cover_img_url":"https://shared.deepinfra.com/models/PrunaAI/p-image-Edit/cover_image.0b3c9cc4fbbd639d05ca6b642dbbed31399471998ecaacdbd65663f678c70a55.webp","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":1.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-12-01T12:31:48+00:00","private":0,"is_partner":true},{"model_name":"sentence-transformers/clip-ViT-B-32","type":"embeddings","reported_type":"embeddings","description":"The CLIP model maps text and images to a shared vector space, enabling various applications such as image search, zero-shot image classification, and image clustering. 
The model can be used easily after installation, and its performance is demonstrated through zero-shot ImageNet validation set accuracy scores. Multilingual versions of the model are also available for 50+ languages.","cover_img_url":"","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":5e-07},"max_tokens":77,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-03-03T03:06:35+00:00","private":0,"is_partner":false},{"model_name":"anthropic/claude-opus-4-7","type":"text-generation","reported_type":"text-generation","description":"Anthropic's most capable production model yet, advancing performance across coding, enterprise workflows, and long-running agentic tasks.","cover_img_url":"https://shared.deepinfra.com/models/anthropic/claude-opus-4-7/cover_image.8f8781b46f9e411ae47135b0cf0aa655c509792024b39b2d94385b58d8430c07.webp","tags":["ocr","no-free-anon","multimodal","json","structured-output","openai","tools","reasoning","cc-native"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":0.0005,"cents_per_output_token":0.0025,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":1000000,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-05-13T16:54:43+00:00","private":0,"is_partner":true},{"model_name":"Qwen/Qwen3-VL-30B-A3B-Instruct","type":"text-generation","reported_type":"text-generation","description":"Meet Qwen3-VL — the most powerful vision-language model in the Qwen series to date.  
This generation delivers comprehensive upgrades across the board: superior text understanding & generation, deeper visual perception & reasoning, extended context length, enhanced spatial and video dynamics comprehension, and stronger agent interaction capabilities.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-VL-30B-A3B-Instruct/cover_image.e898ccaff47d04cf1b9585ea566a44f4e1b205b71127cacbae27be85ed8099b9.webp","tags":["ocr","openai","tools","multimodal","json","non-reasoning","structured-output","cc-native","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.5e-05,"cents_per_output_token":6e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-10-14T21:31:35+00:00","private":0,"is_partner":false},{"model_name":"google/embeddinggemma-300m","type":"embeddings","reported_type":"embeddings","description":"EmbeddingGemma is a 300M parameter multilingual open embedding model from Google DeepMind, designed for efficient deployment even on low-resource devices, producing high-quality text vector representations for tasks such as search, classification, clustering, and semantic similarity.","cover_img_url":"https://shared.deepinfra.com/models/google/embeddinggemma-300m/cover_image.6be517e4e4465d1529f2b0c3118ad1b57b8fef8852a2b5c744f7f8c7dbd72394.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":2e-07},"max_tokens":2048,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-09-16T17:12:48+00:00","private":0,"is_partner":false},{"model_name":"black-forest-labs/FLUX-1-schnell","type":"text-to-image","reported_type":"text-to-image","description":"FLUX.1 [schnell] is a 12 billion parameter rectified flow 
transformer capable of generating images from text descriptions. This model offers cutting-edge output quality and competitive prompt following, matching the performance of closed source alternatives. Trained using latent adversarial diffusion distillation, FLUX.1 [schnell] can generate high-quality images in only 1 to 4 steps. ","cover_img_url":"https://shared.deepinfra.com/models/black-forest-labs/FLUX-1-schnell/cover_image.8c06eda005be0f68b23561357417a93122b9668358b463de8d77951fd6c931c1.webp","tags":["no-free-anon","openai","lora-base"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":0.05,"default_width":1024,"default_height":1024,"default_iterations":1,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2024-08-05T05:59:26+00:00","private":0,"is_partner":false},{"model_name":"BAAI/bge-base-en-v1.5","type":"embeddings","reported_type":"embeddings","description":"BGE embedding is a general Embedding Model. It is pre-trained using retromae and trained on large-scale pair data using contrastive learning. Note that the goal of pre-training is to reconstruct the text, and the pre-trained model cannot be used for similarity calculation directly, it needs to be fine-tuned","cover_img_url":"","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":5e-07},"max_tokens":512,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-11-22T19:55:58+00:00","private":0,"is_partner":false},{"model_name":"Bria/video_foreground_mask","type":"text-to-video","reported_type":"text-to-video","description":"Automatically identify and segment foreground objects across video frames and generate a mask. 
No prompts, just a video.","cover_img_url":"https://shared.deepinfra.com/models/Bria/video_foreground_mask/cover_image.00f575c2a89c7f289af83883fd39c2b4a7ca876d865a31027e1c4ff1dc48f23c.webp","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"output_length","cents_per_output_sec":14.0},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-01-28T18:11:48+00:00","private":0,"is_partner":true},{"model_name":"deepseek-ai/DeepSeek-Prover-V2-671B","type":"text-generation","reported_type":"text-generation","description":"DeepSeek-Prover-V2, an open-source large language model designed for formal theorem proving in Lean 4, with initialization data collected through a recursive theorem proving pipeline powered by DeepSeek-V3. The cold-start training procedure begins by prompting DeepSeek-V3 to decompose complex problems into a series of subgoals. The proofs of resolved subgoals are synthesized into a chain-of-thought process, combined with DeepSeek-V3's step-by-step reasoning, to create an initial cold start for reinforcement learning. 
","cover_img_url":"https://shared.deepinfra.com/models/deepseek-ai/DeepSeek-Prover-V2-671B/cover_image.7875a36ec7d442b8eac4f77ca3f3fd5a4a32d44787e5ff1a99b51223ffb26dc4.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":5e-05,"cents_per_output_token":0.000218,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":163840,"replaced_by":"deepseek-ai/DeepSeek-V3-0324","deprecated":1752642068,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-04-30T21:00:00+00:00","private":0,"is_partner":false},{"model_name":"mistralai/Mistral-Small-3.1-24B-Instruct-2503","type":"text-generation","reported_type":"text-generation","description":"Mistral Small 3.1 (2503) adds state-of-the-art vision understanding and extends context capabilities up to 128K tokens while maintaining top-tier text performance. Its 24 billion parameters and instruction fine-tuning deliver fast, local deployment for both text and vision tasks.","cover_img_url":"https://shared.deepinfra.com/models/mistralai/Mistral-Small-3.1-24B-Instruct-2503/cover_image.1cf18782518559ecf81d087be0a43950bb80ec70f1927adb8562029d320f712e.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":5e-06,"cents_per_output_token":1e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":128000,"replaced_by":"mistralai/Mistral-Small-3.2-24B-Instruct-2506","deprecated":1757621399,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-06-11T20:58:21+00:00","private":0,"is_partner":false},{"model_name":"inworld-ai/realtime-tts-2","type":"text-to-speech","reported_type":"text-to-speech","description":"Realtime TTS 2.0 is a low-latency text-to-speech model with natural language steering, allowing you to control tone and emotion directly in the prompt 
(e.g., “[be happy and upbeat] Hello!”). It supports cross-lingual voices and multiple languages, enabling the same voice to speak consistently across different languages. This is an early access preview ahead of full launch, with ongoing improvements to voice quality and steering.","cover_img_url":"https://shared.deepinfra.com/models/inworld-ai/realtime-tts-2/cover_image.2964675da8cc802bc04279cf827d983e4226f522037f359d9e626bc492b41155.webp","tags":["voice","no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_character_length","cents_per_input_chars":0.0035},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-05-04T22:17:44+00:00","private":0,"is_partner":true},{"model_name":"deepseek-ai/DeepSeek-R1","type":"text-generation","reported_type":"text-generation","description":"We introduce DeepSeek-R1, which incorporates cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1 across math, code, and reasoning tasks. 
","cover_img_url":"https://shared.deepinfra.com/models/deepseek-ai/DeepSeek-R1/cover_image.5d9c2c7f37588d87ed176a0663e51c26f6907914efce7045a0d6fbd4f47a8ad6.webp","tags":["openai","tools","b200","json"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":7e-05,"cents_per_output_token":0.00024,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":163840,"replaced_by":"deepseek-ai/DeepSeek-R1-0528","deprecated":1765492249,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2025-01-22T22:43:37+00:00","private":0,"is_partner":false},{"model_name":"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning","type":"text-generation","reported_type":"text-generation","description":"Nemotron 3 Nano Omni is an open multimodal model built on a hybrid Mixture-of-Experts (MoE) architecture, engineered for high efficiency and strong accuracy across image, video, audio, and text inputs. It powers always-on sub-agents for computer use, document intelligence, and audio-video understanding—replacing fragmented vision, speech, and language pipelines with a single unified inference 
pass.","cover_img_url":"https://shared.deepinfra.com/models/nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning/cover_image.79804349ae0e8f83778dab614517b44905c11bbd340b57c78ab4440186c4dfb7.webp","tags":["openai","structured-output","tools","json","reasoning","multimodal","input-audio","cc-native","priority","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2e-05,"cents_per_output_token":8e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2026-04-28T05:29:29+00:00","private":0,"is_partner":false},{"model_name":"deepseek-ai/DeepSeek-V4-Flash","type":"text-generation","reported_type":"text-generation","description":"DeepSeek V4 Flash is an efficiency-focused MoE model with 284B total parameters (13B active) and a 1M-token context window. It's tuned for fast inference and high-throughput use cases while still holding up on reasoning and coding tasks.","cover_img_url":"https://shared.deepinfra.com/models/deepseek-ai/DeepSeek-V4-Flash/cover_image.816346e8186ef1652de4efda60bee06c74b3629aba75fd26151fc15bc69185c7.webp","tags":["structured-output","openai","tools","json","can-disable-reasoning","reasoning","priority","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":9e-06,"cents_per_output_token":1.8e-05,"rate_per_input_token_cached":0.2,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":1048576,"replaced_by":null,"deprecated":null,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2026-04-24T06:16:17+00:00","private":0,"is_partner":false},{"model_name":"deepseek-ai/DeepSeek-R1-Distill-Qwen-32B","type":"text-generation","reported_type":"text-generation","description":"DeepSeek R1 Distill Qwen 32B is a distilled large language model based on 
Qwen 2.5 32B, using outputs from DeepSeek R1. It outperforms OpenAI's o1-mini across various benchmarks, achieving new state-of-the-art results for dense models.  Other benchmark results include:  AIME 2024: 72.6 | MATH-500: 94.3 | CodeForces Rating: 1691.","cover_img_url":"https://shared.deepinfra.com/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/cover_image.50d8657bcdde162c58121126e03554cf12830f7d2eb80d300e804c587dbb1a73.webp","tags":["openai","json"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2.7e-05,"cents_per_output_token":2.7e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":"deepseek-ai/DeepSeek-R1-0528","deprecated":1781220528,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-01-31T20:01:33+00:00","private":0,"is_partner":false},{"model_name":"google/gemma-4-26B-A4B-it","type":"text-generation","reported_type":"text-generation","description":"Efficient, MoE variant of Gemma 4. Gemma is a family of open models built by Google DeepMind. 
Gemma 4 models are multimodal, handling text and image input and generating text output.","cover_img_url":"https://shared.deepinfra.com/models/google/gemma-4-26B-A4B-it/cover_image.1e436eeb45d6d09ac7f5297bf78b53b76e776ed2769099d5b85757f99dce2ab3.webp","tags":["openai","tools","cc-native","multimodal","json","structured-output","reasoning","can-disable-reasoning","priority","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":7e-06,"cents_per_output_token":3.4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2026-04-03T02:21:00+00:00","private":0,"is_partner":false},{"model_name":"PrunaAI/p-video","type":"text-to-video","reported_type":"text-to-video","description":"Real-time AI video generation from text, images, and audio. Supports up to 1080p at 48 FPS with built-in audio generation, draft mode for 4x faster previews, and prompt upsampling.","cover_img_url":"https://shared.deepinfra.com/models/PrunaAI/p-video/cover_image.bd9632c1b98a138819b131ce45aed4295a9ae6ae8652f137089ad71e597a41ec.webp","tags":["no-free-anon","openai"],"pricing":{"short":"$0.02 / second","full":"$0.02 / second for 720P, $0.04 / second for 1080P, $0.005 / second for 720P DRAFT, $0.01 / second for 1080P DRAFT","table":null,"type":"output_length","cents_per_output_sec":2.0},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-02-26T08:32:52+00:00","private":0,"is_partner":true},{"model_name":"Bria/remove_background","type":"text-to-image","reported_type":"text-to-image","description":"Bria RMBG 2.0 enables seamless removal of backgrounds from images, ideal for professional editing tasks. 
Trained exclusively on licensed data for safe and risk-free commercial use.","cover_img_url":"https://shared.deepinfra.com/models/Bria/remove_background/cover_image.f43f64c1714d08f713133968e14eb4b03bfd5155f375b6bb90b1bfb2d2231ef0.webp","tags":["no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":1.8,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-09-01T13:41:42+00:00","private":0,"is_partner":true},{"model_name":"meta-llama/Llama-3.3-70B-Instruct","type":"text-generation","reported_type":"text-generation","description":"Llama 3.3-70B is a multilingual LLM trained on a massive dataset of 15 trillion tokens, fine-tuned for instruction-following and conversational dialogue. The model is designed to be helpful, safe, and flexible, with a focus on responsible deployment and mitigating potential risks such as bias, toxicity, and misinformation. 
It achieves state-of-the-art performance on various benchmarks, including conversational tasks, language translation, and text generation.","cover_img_url":"https://shared.deepinfra.com/models/meta-llama/Llama-3.3-70B-Instruct/cover_image.e90aef7bcab5b478ff3608e1b174f2d2e6580188adddacd74c6a24448e61a6db.webp","tags":["openai","json","tools","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2.3e-05,"cents_per_output_token":4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":"meta-llama/Llama-3.3-70B-Instruct-Turbo","deprecated":1762392581,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-12-06T17:39:34+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-235B-A22B-Instruct-2507","type":"text-generation","reported_type":"text-generation","description":"Qwen3-235B-A22B-Instruct-2507 is the updated version of the Qwen3-235B-A22B non-thinking mode, featuring significant improvements in general capabilities, including instruction following, logical reasoning, text comprehension, mathematics, science, coding and tool usage.  
","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-235B-A22B-Instruct-2507/cover_image.fe0214ec9e1c4865a1d84f18925f599aa94bd8b3da768c500cf6b218e2ff51c5.webp","tags":["openai","json","tools","structured-output","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":9e-06,"cents_per_output_token":1e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-07-21T23:03:46+00:00","private":0,"is_partner":false},{"model_name":"google/gemma-2-9b-it","type":"text-generation","reported_type":"text-generation","description":"Gemma is a family of lightweight, state-of-the-art open models from Google. The 9B Gemma 2 model delivers class-leading performance, outperforming Llama 3 8B and other open models in its size category.","cover_img_url":"https://shared.deepinfra.com/models/google/gemma-2-9b-it/cover_image.e81c22c5f9e87775e9e5316c038dcd4cb8ae875e513f14e665d84ee891a73f9b.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3e-06,"cents_per_output_token":6e-06,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":8192,"replaced_by":"google/gemma-3-12b-it","deprecated":1743551605,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-07-09T18:36:53+00:00","private":0,"is_partner":false},{"model_name":"meta-llama/Llama-Guard-3-8B","type":"text-generation","reported_type":"text-generation","description":"Llama Guard 3 is a Llama-3.1-8B pretrained model, fine-tuned for content safety classification. Similar to previous versions, it can be used to classify content in both LLM inputs (prompt classification) and in LLM responses (response classification). 
It acts as an LLM – it generates text in its output that indicates whether a given prompt or response is safe or unsafe, and if unsafe, it also lists the content categories violated.","cover_img_url":"https://shared.deepinfra.com/models/meta-llama/Llama-Guard-3-8B/cover_image.f9e36e699c7463071ba59d356773d4f8e45337a00cb21ae784a792035c29074d.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":5.5e-06,"cents_per_output_token":5.5e-06,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":"meta-llama/Llama-Guard-4-12B","deprecated":1762824579,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2025-04-18T22:35:14+00:00","private":0,"is_partner":false},{"model_name":"anthropic/claude-haiku-4-5","type":"text-generation","reported_type":"text-generation","description":"The next generation of Anthropic's fastest and most cost-effective model, optimal for use cases where speed and affordability matter.","cover_img_url":"https://shared.deepinfra.com/models/anthropic/claude-haiku-4-5/cover_image.c2eb61836d5a85e18eed94b3aa6079b8bf3bfe448c801ab7115c210adff90c4d.webp","tags":["json","structured-output","cc-native","reasoning","ocr","openai","multimodal","no-free-anon","tools"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":0.0001,"cents_per_output_token":0.0005,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":200000,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-05-13T16:58:36+00:00","private":0,"is_partner":true},{"model_name":"Qwen/Qwen3-VL-8B-Thinking","type":"text-generation","reported_type":"text-generation","description":"Meet Qwen3-VL — the most powerful vision-language model in the Qwen series to date.  
This generation delivers comprehensive upgrades across the board: superior text understanding & generation, deeper visual perception & reasoning, extended context length, enhanced spatial and video dynamics comprehension, and stronger agent interaction capabilities.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-VL-8B-Thinking/cover_image.ccb94eec3e11f8c87e89b9c810109f3f59c3c947ac6a58fae33de94f209129eb.webp","tags":["openai","cc-native","multimodal","json","structured-output","reasoning","tools"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.8e-05,"cents_per_output_token":0.000209,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":262144,"replaced_by":"Qwen/Qwen3-VL-235B-A22B-Instruct","deprecated":1762816774,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-10-14T21:32:04+00:00","private":0,"is_partner":false},{"model_name":"meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo","type":"text-generation","reported_type":"text-generation","description":"Meta developed and released the Meta Llama 3.1 family of large language models (LLMs), a collection of pretrained and instruction tuned generative text models in 8B, 70B and 405B 
sizes","cover_img_url":"https://shared.deepinfra.com/models/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo/cover_image.72a176f3bc06874ecc847a5743722cfdcbffa23e9d69fdd5722a0c7a5ebc214e.webp","tags":["json","tools","openai","structured-output","b200","non-reasoning"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":4e-05,"cents_per_output_token":4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":64.69,"expected":null,"create_ts":"2024-10-19T05:45:32+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-30B-A3B","type":"text-generation","reported_type":"text-generation","description":"Qwen3 is the latest generation of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-30B-A3B/cover_image.099ee65844ad72c97a1e6377194dfb6f695d7164fb006609941438b88ec1b980.webp","tags":["openai","tools","reasoning","json","non-reasoning","structured-output","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.2e-05,"cents_per_output_token":5e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":40960,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-04-28T22:56:26+00:00","private":0,"is_partner":false},{"model_name":"deepseek-ai/DeepSeek-V4-Pro","type":"text-generation","reported_type":"text-generation","description":"DeepSeek V4 Pro is an MoE model with 1.6T total parameters (49B active) and a 1M-token context window. 
It's built for advanced reasoning, coding, and long-running agent tasks, and performs well on knowledge, math, and software engineering benchmarks.","cover_img_url":"https://shared.deepinfra.com/models/deepseek-ai/DeepSeek-V4-Pro/cover_image.2aafc71db68ab31f66b467460c86f49a02ca7efaddfbc7a87a738d0401246143.webp","tags":["structured-output","openai","can-disable-reasoning","json","tools","non-reasoning","priority","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":0.00013,"cents_per_output_token":0.00026,"rate_per_input_token_cached":0.07692308,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":1048576,"replaced_by":null,"deprecated":null,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2026-04-24T07:37:18+00:00","private":0,"is_partner":false},{"model_name":"google/gemma-3-12b-it","type":"text-generation","reported_type":"text-generation","description":"Gemma 3 introduces multimodality, supporting vision-language input and text outputs. It handles context windows up to 128k tokens, understands over 140 languages, and offers improved math, reasoning, and chat capabilities, including structured outputs and function calling. 
Gemma 3-12B is Google's latest open source model, successor to Gemma 2","cover_img_url":"https://shared.deepinfra.com/models/google/gemma-3-12b-it/cover_image.cf6000ae6da004859f30be43c0bac891c88749e254c48d70030d0869ab4e59e2.webp","tags":["ocr","openai","tools","multimodal","json","non-reasoning","structured-output","cc-native","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":5e-06,"cents_per_output_token":1.5e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":131072,"replaced_by":null,"deprecated":null,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2025-03-28T17:57:54+00:00","private":0,"is_partner":false},{"model_name":"meta-llama/Llama-3.2-3B-Instruct","type":"text-generation","reported_type":"text-generation","description":"The Meta Llama 3.2 collection of multilingual large language models (LLMs) is a collection of pretrained and instruction-tuned generative models in 1B and 3B sizes (text in/text out)","cover_img_url":"https://shared.deepinfra.com/models/meta-llama/Llama-3.2-3B-Instruct/cover_image.da2730cb3d36e267a6e76a9b1b95177e26ab22853b046c4ecbb6231f49c25ce4.webp","tags":["openai","tools","json"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2e-06,"cents_per_output_token":2e-06,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":"meta-llama/Llama-3.2-11B-Vision-Instruct","deprecated":1773101029,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-09-26T23:26:30+00:00","private":0,"is_partner":false},{"model_name":"meta-llama/Llama-2-13b-chat-hf","type":"text-generation","reported_type":"text-generation","description":"Llama 2 is a collection of pretrained and fine-tuned generative text models ranging in scale from 7 billion to 70 billion parameters. 
This is the repository for the 13B fine-tuned model, optimized for dialogue use cases and converted for the Hugging Face Transformers format. ","cover_img_url":"https://shared.deepinfra.com/models/meta-llama/Llama-2-13b-chat-hf/cover_image.6cbca6a3445de3ac6e75983f8828fe6b6323ae8a2d3c025ae1561ac5e56e6326.jpg","tags":["openai","json"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.3e-05,"cents_per_output_token":1.3e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":4096,"replaced_by":"meta-llama/Llama-4-Scout-17B-16E-Instruct","deprecated":1778120368,"quantization":"fp16","mmlu":null,"expected":null,"create_ts":"2023-09-29T21:27:09+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen2.5-72B-Instruct","type":"text-generation","reported_type":"text-generation","description":"Qwen2.5 is a model pretrained on a large-scale dataset of up to 18 trillion tokens, offering significant improvements in knowledge, coding, mathematics, and instruction following compared to its predecessor Qwen2. 
The model also features enhanced capabilities in generating long texts, understanding structured data, and generating structured outputs, while supporting multilingual capabilities for over 29 languages.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen2.5-72B-Instruct/cover_image.af6207eaf82155afe78d43fe7cda82ac439e3ce9ff08f577cd6c45c2af41f6c9.webp","tags":["openai","tools","json","non-reasoning","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3.6e-05,"cents_per_output_token":4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":32768,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2024-09-18T18:24:25+00:00","private":0,"is_partner":false},{"model_name":"Pixverse/Pixverse-6-T2V","type":"text-to-video","reported_type":"text-to-video","description":"PixVerse V6 redefines AI video by shifting from isolated generation to a unified, model-driven workflow. Key upgrades include 15-second durations at 1080p resolution and a multi-shot engine. This transition allows creators to move beyond short clips toward meaningful narrative production and professional-grade marketing assets suitable for 2026 digital distribution standards.","cover_img_url":"https://shared.deepinfra.com/models/Pixverse/Pixverse-6-T2V/cover_image.fcab6b4d7ad97f85dbc53c8dc651801e9ab433aa8b22b9b23e84041580980c0d.webp","tags":["no-free-anon","openai"],"pricing":{"short":"$0.045 / second","full":"Per-second pricing varies by quality and audio toggle. 
360p: $0.025/$0.035 · 540p: $0.035/$0.045 · 720p: $0.045/$0.060 · 1080p: $0.090/$0.115 (no audio /\n  with audio).","table":null,"type":"output_length","cents_per_output_sec":4.5},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-05-29T13:57:27+00:00","private":0,"is_partner":true},{"model_name":"mistralai/Voxtral-Mini-3B-2507","type":"automatic-speech-recognition","reported_type":"automatic-speech-recognition","description":"Voxtral Mini is an enhancement of Ministral 3B, incorporating state-of-the-art audio input capabilities while retaining best-in-class text performance. It excels at speech transcription, translation and audio understanding.","cover_img_url":"https://shared.deepinfra.com/models/mistralai/Voxtral-Mini-3B-2507/cover_image.6253f2cb31c45ecaac1fbb1762685b74edc8c1f146a087fd7bd1d8bf46ef105b.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_length","cents_per_input_sec":0.00166667},"max_tokens":32768,"replaced_by":null,"deprecated":null,"quantization":"bf16","mmlu":null,"expected":null,"create_ts":"2025-07-19T00:56:18+00:00","private":0,"is_partner":false},{"model_name":"google/gemini-2.5-pro","type":"text-generation","reported_type":"text-generation","description":"Gemini 2.5 Pro is Google's most advanced thinking model, designed to tackle increasingly complex problems. Gemini 2.5 Pro leads common benchmarks by meaningful margins and showcases strong reasoning and code capabilities.  Gemini 2.5 models are thinking models, capable of reasoning through their thoughts before responding, resulting in enhanced performance and improved accuracy.  
The Gemini 2.5 Pro model is now available on DeepInfra.","cover_img_url":"https://shared.deepinfra.com/models/google/gemini-2.5-pro/cover_image.66dd07a014b824c2e8527348ccd5dd5d023997e1c9c5550414f2503bcb451a3b.webp","tags":["ocr","openai","tools","reasoning","multimodal","no-free-anon","json","structured-output","cc-native"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":0.000125,"cents_per_output_token":0.001,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":1000000,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-04-17T21:49:31+00:00","private":0,"is_partner":true},{"model_name":"Sao10K/L3.3-70B-Euryale-v2.3","type":"text-generation","reported_type":"text-generation","description":"L3.3-70B-Euryale-v2.3 is a model focused on creative roleplay from Sao10k","cover_img_url":"https://shared.deepinfra.com/models/Sao10K/L3.3-70B-Euryale-v2.3/cover_image.3edd53f864ebe24fe632eaf96361233e514097255cc7ab9bb0d96876aece9a7b.webp","tags":["json","openai","non-reasoning","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":8.5e-05,"cents_per_output_token":8.5e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":"Sao10K/L3.1-70B-Euryale-v2.2","deprecated":1778120411,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-01-15T21:12:32+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3.5-4B","type":"text-generation","reported_type":"text-generation","description":"Qwen3.5-4B is a mid-size model from Alibaba's Qwen3.5 series that delivers a strong balance of performance and efficiency. It features a 262K token context window (extensible to 1M with YaRN), thinking/reasoning mode, tool calling, and support for 201 languages. 
Well-suited for complex reasoning, code generation, and agentic applications.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3.5-4B/cover_image.56b67da0804581143291a85fff6a3fdc02a305fbbcc1e4ad447d1999ad02e69c.webp","tags":["structured-output","tools","json","reasoning","openai","cc-native","multimodal","input-video","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3e-06,"cents_per_output_token":1.5e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":"Qwen/Qwen3.5-9B","deprecated":1781216918,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2026-03-24T00:39:30+00:00","private":0,"is_partner":false},{"model_name":"Wan-AI/Wan2.2-T2V-A14B","type":"text-to-video","reported_type":"text-to-video","description":"The Wan2.2 T2V A14B is a next-generation 14B-parameter video foundation model by Wan-AI featuring a novel two-stage denoising architecture. 
It produces 480P videos with improved visual coherence and detail, generating 2 or 5 second clips at 16fps from text prompts.","cover_img_url":"https://shared.deepinfra.com/models/Wan-AI/Wan2.2-T2V-A14B/cover_image.8c5195f1c3eb412f4c84d1149b9c342b79ae44ed1e30aa8fa7baa671367f053c.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"output_length","cents_per_output_sec":3.6},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-04-03T20:21:44+00:00","private":0,"is_partner":false},{"model_name":"ClarityAI/crystal","type":"text-to-image","reported_type":"text-to-image","description":"ClarityAI/crystal is a specialized upscaler optimized for portraits, faces, and products, delivering high-precision enhancements with adjustable detail levels for sharp, natural results.","cover_img_url":"","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":5.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-01-19T13:10:40+00:00","private":0,"is_partner":true},{"model_name":"XpucT/Deliberate","type":"text-to-image","reported_type":"text-to-image","description":"The Deliberate Model allows for the creation of anything desired, with the potential for better results as the user's knowledge and detail in the prompt increase. 
The model is ideal for meticulous anatomy artists, creative prompt writers, art designers, and those seeking explicit content.","cover_img_url":"","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"time","cents_per_sec":0.05},"max_tokens":null,"replaced_by":"stabilityai/sdxl-turbo","deprecated":1727456654,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-03-17T01:59:40+00:00","private":0,"is_partner":false},{"model_name":"NovaSky-AI/Sky-T1-32B-Preview","type":"text-generation","reported_type":"text-generation","description":"This is a 32B reasoning model trained from Qwen2.5-32B-Instruct with 17K data. The performance is on par with o1-preview model on both math and coding.","cover_img_url":"https://shared.deepinfra.com/models/NovaSky-AI/Sky-T1-32B-Preview/cover_image.50c7308479b460765575b02e9b060064817da5b62453b03eb284ddeb1aaecb79.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.2e-05,"cents_per_output_token":1.8e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":32768,"replaced_by":"Qwen/Qwen3-32B","deprecated":1753564079,"quantization":"fp16","mmlu":null,"expected":null,"create_ts":"2025-01-15T00:49:25+00:00","private":0,"is_partner":false},{"model_name":"XiaomiMiMo/MiMo-V2.5-Pro","type":"text-generation","reported_type":"text-generation","description":"MiMo-V2.5-Pro is an open-source Mixture-of-Experts (MoE) language model with 1.02T total parameters and 42B active parameters. 
It utilizes the hybrid attention architecture and 3-layers Multi-Token Prediction (MTP) introduced in [MiMo-V2-Flash](https://github.com/XiaomiMiMo/MiMo-V2-Flash).","cover_img_url":"https://shared.deepinfra.com/models/XiaomiMiMo/MiMo-V2.5-Pro/cover_image.adf2d08a96ec44187e93130104a33c4fd1d137425d4e71b6b1fa88c68578f244.webp","tags":["openai","cc-native","tools","json","structured-output","reasoning","can-disable-reasoning","input-audio","priority","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":0.0001,"cents_per_output_token":0.0003,"rate_per_input_token_cached":0.2,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":1048576,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2026-05-03T02:03:53+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3.5-2B","type":"text-generation","reported_type":"text-generation","description":"Qwen3.5-2B is a compact yet capable model from Alibaba's Qwen3.5 series. It features a 262K token context window, support for 201 languages, thinking/reasoning mode, and tool calling for agentic workflows. 
A strong choice for prototyping, fine-tuning, and efficient multilingual deployments.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3.5-2B/cover_image.c3dfd7ec6ae6a80172c0e4f09af83ff7009829c64032e7cc977f232b330fdba8.webp","tags":["structured-output","tools","json","reasoning","openai","cc-native","multimodal","input-video","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2e-06,"cents_per_output_token":1e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":"Qwen/Qwen3.5-9B","deprecated":1781216920,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2026-03-24T00:39:14+00:00","private":0,"is_partner":false},{"model_name":"hexgrad/Kokoro-82M","type":"text-to-speech","reported_type":"text-to-speech","description":"Kokoro is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, Kokoro can be deployed anywhere from production environments to personal projects.","cover_img_url":"https://shared.deepinfra.com/models/hexgrad/Kokoro-82M/cover_image.f1eff048596a1c36de0985c33410b862d0841ab2ed5d34b9a31c34c98017c748.webp","tags":["priority","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_character_length","cents_per_input_chars":6.2e-05},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-01-15T02:42:21+00:00","private":0,"is_partner":false},{"model_name":"ResembleAI/chatterbox-multilingual","type":"text-to-speech","reported_type":"text-to-speech","description":"09/04 🔥 Introducing Chatterbox Multilingual in 23 Languages!  
We're excited to introduce Chatterbox and Chatterbox Multilingual, Resemble AI's production-grade open source TTS models. Chatterbox Multilingual supports Arabic, Danish, German, Greek, English, Spanish, Finnish, French, Hebrew, Hindi, Italian, Japanese, Korean, Malay, Dutch, Norwegian, Polish, Portuguese, Russian, Swedish, Swahili, Turkish, Chinese out of the box. Licensed under MIT, Chatterbox has been benchmarked against leading closed-source systems like ElevenLabs, and is consistently preferred in side-by-side evaluations.","cover_img_url":"https://shared.deepinfra.com/models/ResembleAI/chatterbox-multilingual/cover_image.7f2b51e8c7cf057e551eaeff51133af38ffceac6da485d7e6203aeef972e149b.webp","tags":["voice","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_character_length","cents_per_input_chars":0.0001},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-01-14T06:08:44+00:00","private":0,"is_partner":false},{"model_name":"google/codegemma-7b-it","type":"text-generation","reported_type":"text-generation","description":"CodeGemma is a collection of lightweight open code models built on top of Gemma. 
CodeGemma models are text-to-text and text-to-code decoder-only models and are available as a 7 billion pretrained variant that specializes in code completion and code generation tasks, a 7 billion parameter instruction-tuned variant for code chat and instruction following and a 2 billion parameter pretrained variant for fast code completion.","cover_img_url":"https://shared.deepinfra.com/models/google/codegemma-7b-it/cover_image.ecf88bf320526e3a113a1e51057501ffe65b8e264912e0c920ae7f95553f74b2.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":7e-06,"cents_per_output_token":7e-06,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":8192,"replaced_by":"google/gemma-3-12b-it","deprecated":1743551605,"quantization":"fp16","mmlu":null,"expected":null,"create_ts":"2024-05-08T00:30:25+00:00","private":0,"is_partner":false},{"model_name":"deepseek-ai/DeepSeek-V3.1-Terminus","type":"text-generation","reported_type":"text-generation","description":"DeepSeek-V3.1 Terminus is an update to DeepSeek V3.1 that maintains the model's original capabilities while addressing issues reported by users, including language consistency and agent capabilities, further optimizing the model's performance in coding and search agents. It is a large hybrid reasoning model (671B parameters, 37B active) that supports both thinking and non-thinking modes. It extends the DeepSeek-V3 base with a two-phase long-context training process. Users can control the reasoning behaviour with the reasoning enabled boolean. Learn more in our docs  The model improves tool use, code generation, and reasoning efficiency, achieving performance comparable to DeepSeek-R1 on difficult benchmarks while responding more quickly. 
It supports structured tool calling, code agents, and search agents, making it suitable for research, coding, and agentic workflows.","cover_img_url":"https://shared.deepinfra.com/models/deepseek-ai/DeepSeek-V3.1-Terminus/cover_image.3379c9f5d976a957ed8f33ed3de2e5c0f5d2199d6ec74140acfdcbd9af51025f.webp","tags":["openai","tools","reasoning","can-disable-reasoning","json","non-reasoning","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2.7e-05,"cents_per_output_token":9.5e-05,"rate_per_input_token_cached":0.48148148,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":163840,"replaced_by":null,"deprecated":null,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2025-09-22T20:32:31+00:00","private":0,"is_partner":false},{"model_name":"openai/whisper-large-v3-turbo","type":"automatic-speech-recognition","reported_type":"automatic-speech-recognition","description":"Whisper is a state-of-the-art model for automatic speech recognition (ASR) and speech translation, proposed in the paper \"Robust Speech Recognition via Large-Scale Weak Supervision\" by Alec Radford  et al. from OpenAI. Trained on >5M hours of labeled data, Whisper demonstrates a strong ability to generalise to many datasets and domains in a zero-shot setting. Whisper large-v3-turbo is a finetuned version of a pruned Whisper large-v3. In other words, it's the exact same model, except that the number of decoding layers have reduced from 32 to 4. 
As a result, the model is way faster, at the expense of a minor quality degradation.","cover_img_url":"https://shared.deepinfra.com/models/openai/whisper-large-v3-turbo/cover_image.bab5ba9805e51b2e27172aad945f02104612869bc0e6d2c46d6bcbdf608f649a.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_length","cents_per_input_sec":0.000333},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2024-10-02T20:34:00+00:00","private":0,"is_partner":false},{"model_name":"nvidia/Nemotron-4-340B-Instruct","type":"text-generation","reported_type":"text-generation","description":"Nemotron-4-340B-Instruct is a chat model intended for use for the English language, designed for Synthetic Data Generation","cover_img_url":"https://shared.deepinfra.com/models/nvidia/Nemotron-4-340B-Instruct/cover_image.6223332b8dfbf8dba008d030548970e7dbee4cfb571412430cc5ff8c007b7072.webp","tags":["openai","tools"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":0.00042,"cents_per_output_token":0.00042,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":4096,"replaced_by":"NousResearch/Hermes-3-Llama-3.1-405B","deprecated":1752207171,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-06-21T07:23:42+00:00","private":0,"is_partner":false},{"model_name":"sentence-transformers/all-MiniLM-L6-v2","type":"embeddings","reported_type":"embeddings","description":"We present a sentence transformation model that achieves state-of-the-art results on various NLP tasks without requiring task-specific architectures or fine-tuning. Our approach leverages contrastive learning and utilizes a variety of datasets to learn robust sentence representations. 
We evaluate our model on several benchmarks and demonstrate its effectiveness in various applications such as text classification, sentiment analysis, named entity recognition, and question answering.","cover_img_url":"","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":5e-07},"max_tokens":512,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-03-22T09:49:10+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen-Image-Edit","type":"text-to-image","reported_type":"text-to-image","description":"Qwen-Image-Edit is a next-generation image editing model built on top of Qwen-Image, designed for both semantic and appearance-level edits. It excels at tasks like precise text modifications, style transfers, viewpoint transformations, and element adjustments while preserving overall visual consistency.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen-Image-Edit/cover_image.339f1c6c3b4e78e3176830085fd8e6d6b684c1ea94fe98e615eebc58db02bd00.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":2.5,"default_width":1024,"default_height":1024,"default_iterations":25,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-08-20T13:09:29+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3.5-35B-A3B","type":"text-generation","reported_type":"text-generation","description":"Qwen3.5-35B-A3B is an efficient Mixture-of-Experts model from Alibaba's Qwen3.5 series with 35B total parameters and only 3B activated per token. It features a 262K token context window (extensible to 1M with YaRN), thinking/reasoning mode, tool calling, and support for 201 languages. 
Delivers strong performance on reasoning, coding, and vision-language tasks at a fraction of the compute cost.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3.5-35B-A3B/cover_image.b5e5b6acd3eec44d7b3a7c212ad647831d696818f0cd25172f5ab68f075929ba.webp","tags":["structured-output","tools","json","reasoning","openai","cc-native","multimodal","input-video","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.4e-05,"cents_per_output_token":0.0001,"rate_per_input_token_cached":0.35714286,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2026-03-24T00:53:31+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen2-72B-Instruct","type":"text-generation","reported_type":"text-generation","description":"The 72 billion parameter Qwen2 excels in language understanding, multilingual capabilities, coding, mathematics, and reasoning.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen2-72B-Instruct/cover_image.6771b7ff122c6ffaa9d5ae6b6ab54db0d46079a8f90441fa7770d60cfd4c6f4f.webp","tags":["openai","tools"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3.5e-05,"cents_per_output_token":4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":32768,"replaced_by":"Qwen/Qwen2.5-72B-Instruct","deprecated":1728579753,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-06-13T22:46:15+00:00","private":0,"is_partner":false},{"model_name":"Sao10K/L3-8B-Lunaris-v1","type":"text-generation","reported_type":"text-generation","description":"A generalist / roleplaying model merge based on Llama 3. 
Sao10K has carefully selected the values based on extensive personal experimentation and has fine-tuned them to create a customized recipe.","cover_img_url":"https://shared.deepinfra.com/models/Sao10K/L3-8B-Lunaris-v1/cover_image.d72dfd670089af5cbe1a16a92a538bb4d50201f12892b333b63a7f91f67536e5.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3e-06,"cents_per_output_token":6e-06,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":8192,"replaced_by":"Sao10K/L3.1-70B-Euryale-v2.2","deprecated":1743551422,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-11-20T00:48:45+00:00","private":0,"is_partner":false},{"model_name":"allenai/olmOCR-2-7B-1025","type":"text-generation","reported_type":"text-generation","description":"olmOCR is a specialized AI tool that converts PDF documents into clean, structured text while preserving important formatting and layout information. What makes olmOCR particularly valuable for developers is its ability to handle challenging PDFs that traditional OCR tools struggle with—including complex layouts, poor-quality scans, handwritten text, and documents with mixed content types. 
Built on a fine-tuned 7B vision-language model, olmOCR provides enterprise-grade PDF processing at a fraction of the cost of proprietary solutions.","cover_img_url":"https://shared.deepinfra.com/models/allenai/olmOCR-2-7B-1025/cover_image.ec0402a3d89ba4e81717c1eb3b9589c5135705dd7698921e2693fc3cea47e3a6.webp","tags":["ocr","openai","multimodal","ocr-ui","json","non-reasoning","structured-output","cc-native"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":9e-06,"cents_per_output_token":1.9e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":16384,"replaced_by":"google/gemma-4-31B-it","deprecated":1778120282,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-10-21T23:13:48+00:00","private":0,"is_partner":false},{"model_name":"ResembleAI/chatterbox","type":"text-to-speech","reported_type":"text-to-speech","description":"Chatterbox is Resemble AI's first production-grade open source TTS model. Licensed under MIT, Chatterbox has been benchmarked against leading closed-source systems like ElevenLabs, and is consistently preferred in side-by-side evaluations. Whether you're working on memes, videos, games, or AI agents, Chatterbox brings your content to life. 
It's also the first open source TTS model to support emotion exaggeration control, a powerful feature that makes your voices stand out.","cover_img_url":"https://shared.deepinfra.com/models/ResembleAI/chatterbox/cover_image.431f682258abfe11365bf44bb9e6b2c35ed30c431141dbbf4d2967b169854e70.webp","tags":["voice"],"pricing":{"short":null,"full":null,"table":null,"type":"input_character_length","cents_per_input_chars":0.001},"max_tokens":null,"replaced_by":"ResembleAI/chatterbox-turbo","deprecated":1773117389,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-06-03T01:41:24+00:00","private":0,"is_partner":false},{"model_name":"anthropic/claude-4-sonnet","type":"text-generation","reported_type":"text-generation","description":"Anthropic's mid-size model with superior intelligence for high-volume uses in coding, in-depth research, agents, & more.","cover_img_url":"https://shared.deepinfra.com/models/anthropic/claude-4-sonnet/cover_image.abb94fa3840b2f233747c56e1877ad22a139fad1fdd205914141f03a786ca74c.webp","tags":["ocr","openai","tools","multimodal","no-free-anon","json","non-reasoning","structured-output","cc-native"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":0.00033,"cents_per_output_token":0.00165,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":200000,"replaced_by":"anthropic/claude-sonnet-4-6","deprecated":1779304953,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-06-12T17:54:48+00:00","private":0,"is_partner":true},{"model_name":"run-diffusion/Juggernaut-Lightning-Flux","type":"text-to-image","reported_type":"text-to-image","description":"Blazing-fast, high-quality images rendered at five times the speed of Flux. 
Perfect for mood boards and mass ideation, this model excels in both realism and prompt adherence.","cover_img_url":"https://shared.deepinfra.com/models/run-diffusion/Juggernaut-Lightning-Flux/cover_image.11c384abe65472e26ffc93ea9cf6552c7ef42dd8735776392eddeefa6264427c.webp","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":0.9,"default_width":1024,"default_height":1024,"default_iterations":25,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":"black-forest-labs/FLUX-1-schnell","deprecated":1751283332,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-03-18T12:19:53+00:00","private":0,"is_partner":false},{"model_name":"meta-llama/Llama-4-Maverick-17B-128E-Instruct-Turbo","type":"text-generation","reported_type":"text-generation","description":"The Llama 4 collection of models are natively multimodal AI models that enable text and multimodal experiences. These models leverage a mixture-of-experts architecture to offer industry-leading performance in text and image understanding. 
Llama 4 Maverick, a 17 billion parameter model with 128 experts","cover_img_url":"https://shared.deepinfra.com/models/meta-llama/Llama-4-Maverick-17B-128E-Instruct-Turbo/cover_image.a5950122fccf2e7f8ef9de11467cae7871568b670817cd1b5c2d2339db1f941d.webp","tags":["openai","b200"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":5e-05,"cents_per_output_token":5e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":8192,"replaced_by":"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8","deprecated":1757628268,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-05-16T00:13:54+00:00","private":0,"is_partner":false},{"model_name":"ByteDance/Seed-1.8","type":"text-generation","reported_type":"text-generation","description":"Optimized specifically for multimodal agent scenarios. It features enhanced agent capabilities, upgraded multimodal comprehension, and more flexible context management.","cover_img_url":"https://shared.deepinfra.com/models/ByteDance/Seed-1.8/cover_image.cb600a23d0b2ba7499346c021edb1097501a09cf2a6ee4f0641fd76180624182.webp","tags":["openai","tools","reasoning","multimodal","no-free-anon","json","structured-output","cc-native"],"pricing":{"short":null,"full":"$0.25 in $2 out $0.05 cached <= 128K, $0.5 in $4 out $0.1 cached","table":null,"type":"tokens","cents_per_input_token":2.5e-05,"cents_per_output_token":0.0002,"rate_per_input_token_cached":0.2,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":256000,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-02-17T11:42:54+00:00","private":0,"is_partner":true},{"model_name":"openai/clip-vit-base-patch32","type":"zero-shot-image-classification","reported_type":"zero-shot-image-classification","description":"The CLIP model was developed by OpenAI to investigate the robustness of computer 
vision models. It uses a Vision Transformer architecture and was trained on a large dataset of image-caption pairs. The model shows promise in various computer vision tasks but also has limitations, including difficulties with fine-grained classification and potential biases in certain applications.","cover_img_url":"","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"time","cents_per_sec":0.05},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-05-02T00:26:15+00:00","private":0,"is_partner":false},{"model_name":"nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16","type":"text-generation","reported_type":"text-generation","description":"Nemotron 3 Ultra is built for frontier reasoning, orchestration, coding agents, deep research, and complex enterprise workflows. It delivers up to 5x faster inference and up to 30% lower cost for agentic workloads while supporting up to 1M token context.","cover_img_url":"https://shared.deepinfra.com/models/nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16/cover_image.c439b51394aad5172b70ec5c7d25bfdfa4d25352cd12b774efcb9570890aa781.webp","tags":["structured-output","openai","reasoning","multimodal","json","tools","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":0.0001,"cents_per_output_token":0.0005,"rate_per_input_token_cached":0.3,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":"nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B","deprecated":1782496337,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-06-10T22:12:15+00:00","private":0,"is_partner":false},{"model_name":"Wan-AI/Wan2.1-T2V-1.3B","type":"text-to-video","reported_type":"text-to-video","description":"The Wan2.1 1.3B model is a lightweight, efficient text-to-video generator. 
Despite its compact size, it delivers impressive performance across benchmarks and generates high-quality 480P videos.","cover_img_url":"https://shared.deepinfra.com/models/Wan-AI/Wan2.1-T2V-1.3B/cover_image.c6484a6ef9d6ee653bc20d668f37b40131720035cf7b7dcf064392859fc792ad.webp","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":10.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":"Wan-AI/Wan2.2-T2V-A14B","deprecated":1775771321,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-03-13T12:33:24+00:00","private":0,"is_partner":false},{"model_name":"moonshotai/Kimi-K2.5","type":"text-generation","reported_type":"text-generation","description":"Kimi K2.5 is an open-source, native multimodal agentic model built through continual pretraining on approximately 15 trillion mixed visual and text tokens atop Kimi-K2-Base. It seamlessly integrates vision and language understanding with advanced agentic capabilities, instant and thinking modes, as well as conversational and agentic paradigms.","cover_img_url":"https://shared.deepinfra.com/models/moonshotai/Kimi-K2.5/cover_image.01dedca75a300334c4306a669d0ff4c337db60ba1891da40d000cbfd392e19c3.webp","tags":["json","reasoning","tools","openai","structured-output","multimodal","priority","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":4.5e-05,"cents_per_output_token":0.000225,"rate_per_input_token_cached":0.15555556,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2026-01-27T22:59:01+00:00","private":0,"is_partner":false},{"model_name":"google/gemma-4-31B-it-turbo","type":"text-generation","reported_type":"text-generation","description":"Gemma is a family of open models built 
by Google DeepMind. Gemma 4 models are multimodal, handling text and image input and generating text output.","cover_img_url":"https://shared.deepinfra.com/models/google/gemma-4-31B-it-turbo/cover_image.babd01c79288f556de1658963cbee32f654fe7e40863fb389b426390742a8d64.webp","tags":["openai","cc-native","reasoning","json","tools","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.2e-05,"cents_per_output_token":3.7e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2026-05-11T18:15:12+00:00","private":0,"is_partner":false},{"model_name":"mistralai/Mistral-Small-24B-Instruct-2501","type":"text-generation","reported_type":"text-generation","description":"Mistral Small 3 is a 24B-parameter language model optimized for low-latency performance across common AI tasks. Released under the Apache 2.0 license, it features both pre-trained and instruction-tuned versions designed for efficient local deployment.  
The model achieves 81% accuracy on the MMLU benchmark and performs competitively with larger models like Llama 3.3 70B and Qwen 32B, while operating at three times the speed on equivalent hardware.","cover_img_url":"https://shared.deepinfra.com/models/mistralai/Mistral-Small-24B-Instruct-2501/cover_image.5e02fbf50581c375f4d53d48811daa0b2ed4e993ee06cc55ef793868aa3db91b.webp","tags":["json","openai","non-reasoning","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":5e-06,"cents_per_output_token":8e-06,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":32768,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-01-31T20:14:03+00:00","private":0,"is_partner":false},{"model_name":"zai-org/GLM-4.7","type":"text-generation","reported_type":"text-generation","description":"GLM-4.7 is a state-of-the-art, multilingual Mixture-of-Experts (MoE) language model designed for complex reasoning, agentic coding, and tool use. Building on its predecessor GLM-4.6, it delivers significant improvements across key benchmarks, including multilingual SWE-bench, Terminal Bench, and reasoning-heavy evaluations like HLE. The model features advanced \"Interleaved Thinking\" and new \"Preserved Thinking\" modes, allowing it to reason before actions and maintain consistency across long, multi-turn tasks. 
With 358 billion parameters, GLM-4.7 excels in generating clean code, modern UI elements, and sophisticated reasoning outputs.","cover_img_url":"https://shared.deepinfra.com/models/zai-org/GLM-4.7/cover_image.9ecaa357000ea0f8c5a774a9f5197de7698690faaf842cbf3ef5694781d3f590.webp","tags":["json","can-disable-reasoning","openai","structured-output","tools","reasoning"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":4e-05,"cents_per_output_token":0.000175,"rate_per_input_token_cached":0.2,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":202752,"replaced_by":null,"deprecated":null,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2025-12-23T02:20:26+00:00","private":0,"is_partner":false},{"model_name":"Qwen/QwQ-32B","type":"text-generation","reported_type":"text-generation","description":"QwQ is the reasoning model of the Qwen series. Compared with conventional instruction-tuned models, QwQ, which is capable of thinking and reasoning, can achieve significantly enhanced performance in downstream tasks, especially hard problems. 
QwQ-32B is the medium-sized reasoning model, which is capable of achieving competitive performance against state-of-the-art reasoning models, e.g., DeepSeek-R1, o1-mini.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/QwQ-32B/cover_image.bfaeb4e5817d3ad2bf6807dbaa7bca6ebea82e4a40915c80048fd770dad7a60b.webp","tags":["openai","json","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.5e-05,"cents_per_output_token":4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":"Qwen/Qwen3-32B","deprecated":1762384613,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2025-03-07T00:07:32+00:00","private":0,"is_partner":false},{"model_name":"stabilityai/sd3.5","type":"text-to-image","reported_type":"text-to-image","description":"At 8 billion parameters, with superior quality and prompt adherence, this base model is the most powerful in the Stable Diffusion family. This model is ideal for professional use cases at 1 megapixel resolution","cover_img_url":"https://shared.deepinfra.com/models/stabilityai/sd3.5/cover_image.d4c4620d463c08653ae9e0884160ee72217486df908ca17ea3e296dba16caf99.webp","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":6.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":"stabilityai/sdxl-turbo","deprecated":1761252480,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2024-10-22T17:55:18+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-Embedding-4B","type":"embeddings","reported_type":"embeddings","description":"The Qwen3 Embedding model series is the latest proprietary model of the Qwen family, specifically designed for text embedding and ranking tasks. 
Building upon the dense foundational models of the Qwen3 series, it provides a comprehensive range of text embeddings and reranking models in various sizes (0.6B, 4B, and 8B).","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-Embedding-4B/cover_image.339f1c6c3b4e78e3176830085fd8e6d6b684c1ea94fe98e615eebc58db02bd00.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":2e-06},"max_tokens":32768,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-06-10T13:04:24+00:00","private":0,"is_partner":false},{"model_name":"Wan-AI/Wan2.7-I2V","type":"text-to-video","reported_type":"text-to-video","description":"Generates video content from images while stably preserving details such as subject, style, and text elements. Ensures visual consistency and information fidelity throughout dynamic transitions.","cover_img_url":"https://shared.deepinfra.com/models/Wan-AI/Wan2.7-I2V/cover_image.9d7da10338525cd72ba306fdc5803354c38f0443271a628cd9103c13f248cec0.webp","tags":["no-free-anon"],"pricing":{"short":"$0.10 / second","full":"$0.15 / second for 1080P, $0.10 / second for 720P","table":{"columns":["resolution","$ cost per second"],"rows":[["720P","$0.10"],["1080P","$0.15"]]},"type":"output_length","cents_per_output_sec":10.0},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-04-27T15:25:02+00:00","private":0,"is_partner":true},{"model_name":"Bria/gen_fill","type":"text-to-image","reported_type":"text-to-image","description":"Bria GenFill enables high-quality object addition or visual transformation. 
Trained exclusively on licensed data for safe and risk-free commercial use.","cover_img_url":"https://shared.deepinfra.com/models/Bria/gen_fill/cover_image.eedddedb2fafcea00daf4cf3ceabdfc6e32f966e5462993bab0b9d8c41c0f4b2.webp","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":4.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-09-01T13:41:39+00:00","private":0,"is_partner":true},{"model_name":"Sao10K/L3-70B-Euryale-v2.1","type":"text-generation","reported_type":"text-generation","description":"Euryale 70B v2.1 is a model focused on creative roleplay from Sao10k","cover_img_url":"https://shared.deepinfra.com/models/Sao10K/L3-70B-Euryale-v2.1/cover_image.f092eab9d5027b46b76339eae05a40e0235e8481e7e0da0776fe53a509f37957.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":7e-05,"cents_per_output_token":8e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":8192,"replaced_by":"Sao10K/L3.1-70B-Euryale-v2.2","deprecated":1743551444,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2024-07-16T22:21:27+00:00","private":0,"is_partner":false},{"model_name":"microsoft/WizardLM-2-8x22B","type":"text-generation","reported_type":"text-generation","description":"WizardLM-2 8x22B is Microsoft AI's most advanced Wizard model. 
It demonstrates highly competitive performance compared to those leading proprietary models.","cover_img_url":"https://shared.deepinfra.com/models/microsoft/WizardLM-2-8x22B/cover_image.395b63e0d661def89bf43c88976a699b066f69208b3b58ae5cc2663693033ee8.webp","tags":["openai","json","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":4.8e-05,"cents_per_output_token":4.8e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":65536,"replaced_by":"deepseek-ai/DeepSeek-V3.2","deprecated":1771449048,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-04-16T07:20:07+00:00","private":0,"is_partner":false},{"model_name":"deepseek-ai/DeepSeek-V3","type":"text-generation","reported_type":"text-generation","description":"DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. To achieve efficient inference and cost-effective training, DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architectures, which were thoroughly validated in DeepSeek-V2. 
","cover_img_url":"https://shared.deepinfra.com/models/deepseek-ai/DeepSeek-V3/cover_image.8e84763e53c3f507b73dd67368d030a188a74df8923da52e8fea219529a73339.webp","tags":["openai","tools","b200","json","non-reasoning","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3.2e-05,"cents_per_output_token":8.9e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":163840,"replaced_by":null,"deprecated":null,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2025-01-03T23:03:07+00:00","private":0,"is_partner":false},{"model_name":"google/gemini-3-pro-image","type":"text-to-image","reported_type":"text-to-image","description":"Nano Banana Pro (Gemini 3 Pro Image) is designed to tackle the most challenging image generation by incorporating state-of-the-art reasoning capabilities. It is the best model for complex and multi-turn image generation and editing.","cover_img_url":"https://shared.deepinfra.com/models/google/gemini-3-pro-image/cover_image.ce46365e7d520577cc803c2b935c6745c4a6884019381b718cb21a1a9250d681.webp","tags":["no-free-anon","openai"],"pricing":{"short":null,"full":"560 tokens per input image, with output image costs scaling by resolution: 1120 tokens ($0.134) for 1K and 2K (roughly 1MP and 4MP), and 2000 tokens ($0.24) for 4K (roughly 16MP)","table":null,"type":"input_tokens","cents_per_input_token":0.012},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-05-13T04:18:26+00:00","private":0,"is_partner":true},{"model_name":"mistralai/Devstral-Small-2507","type":"text-generation","reported_type":"text-generation","description":"Devstral is an agentic LLM for software engineering tasks, making it a great choice for software engineering 
agents.","cover_img_url":"https://shared.deepinfra.com/models/mistralai/Devstral-Small-2507/cover_image.2ca30682c46317e1838d97ea88feffb2e3deecad1e88dfe314351e825fadd56c.webp","tags":["openai","cc-native","tools"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":7e-06,"cents_per_output_token":2.8e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":128000,"replaced_by":"mistralai/Mistral-Small-3.2-24B-Instruct-2506","deprecated":1757623863,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-07-14T21:38:48+00:00","private":0,"is_partner":false},{"model_name":"microsoft/Phi-4-multimodal-instruct","type":"text-generation","reported_type":"text-generation","description":"Phi-4-multimodal-instruct is a lightweight open multimodal foundation model that leverages the language, vision, and speech research and datasets used for Phi-3.5 and 4.0 models. The model processes text, image, and audio inputs, generating text outputs, and comes with 128K token context length. The model underwent an enhancement process, incorporating both supervised fine-tuning, direct preference optimization and RLHF (Reinforcement Learning from Human Feedback) to support precise instruction adherence and safety measures. 
The languages that each modal supports are the following: - Text: Arabic, Chinese, Czech, Danish, Dutch, English, Finnish, French, German, Hebrew, Hungarian, Italian, Japanese, Korean, Norwegian, Polish, Portuguese, Russian, Spanish, Swedish, Thai, Turkish, Ukrainian - Vision: English - Audio: English, Chinese, German, French, Italian, Japanese, Spanish, Portuguese","cover_img_url":"https://shared.deepinfra.com/models/microsoft/Phi-4-multimodal-instruct/cover_image.87facac507628c39df27e89fc610915c993148a31a2b580c4ceceb25861a76de.webp","tags":["openai","multimodal","cc-native"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":5e-06,"cents_per_output_token":1e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":"google/gemma-3-12b-it","deprecated":1754935710,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2025-03-06T00:30:01+00:00","private":0,"is_partner":false},{"model_name":"meta-llama/Llama-Guard-4-12B","type":"text-generation","reported_type":"text-generation","description":"Llama Guard 4 is a natively multimodal safety classifier with 12 billion parameters trained jointly on text and multiple images. Llama Guard 4 is a dense architecture pruned from the Llama 4 Scout pre-trained model and fine-tuned for content safety classification. Similar to previous versions, it can be used to classify content in both LLM inputs (prompt classification) and in LLM responses (response classification). 
It itself acts as an LLM: it generates text in its output that indicates whether a given prompt or response is safe or unsafe, and if unsafe, it also lists the content categories violated.","cover_img_url":"https://shared.deepinfra.com/models/meta-llama/Llama-Guard-4-12B/cover_image.907c11bf5e824fbfadcce17efd86eae1cd697532e8213f6fe75195aeb78622c8.webp","tags":["ocr","openai","multimodal","non-reasoning","cc-native","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.8e-05,"cents_per_output_token":1.8e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":163840,"replaced_by":null,"deprecated":null,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2025-04-28T22:16:02+00:00","private":0,"is_partner":false},{"model_name":"google/veo-3.1-fast","type":"text-to-video","reported_type":"text-to-video","description":"Veo 3.1 is the latest text-to-video model from Google that generates high-fidelity, cinematic videos with synchronized audio from a simple text prompt. It excels at creating realistic and imaginative scenes with a deep understanding of natural language and visual dynamics.","cover_img_url":"https://shared.deepinfra.com/models/google/veo-3.1-fast/cover_image.a773baaf91598677a503558b49af3ee640e170d61955e17e0cb4017388b67fc6.webp","tags":["no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"output_length","cents_per_output_sec":15.0},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-10-16T12:59:50+00:00","private":0,"is_partner":true},{"model_name":"inworld-ai/realtime-tts-1.5-max","type":"text-to-speech","reported_type":"text-to-speech","description":"High-quality multilingual text-to-speech model by Inworld AI with 130+ preset voices across 15 languages. Supports voice cloning, word-level timestamps, and streaming. 
Optimized for natural, expressive speech with <250ms time-to-first-audio.","cover_img_url":"https://shared.deepinfra.com/models/inworld-ai/realtime-tts-1.5-max/cover_image.0a32888cec59599de81ef01cefa11051be0867f4b23cefc6239d830bc7c88555.webp","tags":["no-free-anon","voice","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_character_length","cents_per_input_chars":0.005},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-05-04T22:16:56+00:00","private":0,"is_partner":true},{"model_name":"XiaomiMiMo/MiMo-V2.5","type":"text-generation","reported_type":"text-generation","description":"MiMo-V2.5 is a native omnimodal model with strong agentic capabilities, supporting text, image, video, and audio understanding within a unified architecture. Built upon the MiMo-V2-Flash backbone and extended with dedicated vision and audio encoders, it delivers robust performance across multimodal perception, long-context reasoning, and agentic workflows. ","cover_img_url":"https://shared.deepinfra.com/models/XiaomiMiMo/MiMo-V2.5/cover_image.e0616f0ce48073142d5f1b718e41e072c273380261b966d46bb527c2cff0ed1d.webp","tags":["openai","cc-native","tools","json","structured-output","reasoning","can-disable-reasoning","multimodal","priority","input-video","input-audio","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":4e-05,"cents_per_output_token":0.0002,"rate_per_input_token_cached":0.2,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-04-29T23:38:10+00:00","private":0,"is_partner":false},{"model_name":"google/gemini-2.5-flash","type":"text-generation","reported_type":"text-generation","description":"Gemini 2.5 Flash is Google's latest thinking model, designed to tackle increasingly complex problems. 
It's capable of reasoning through its thoughts before responding, resulting in enhanced performance and improved accuracy.  Gemini 2.5 Flash: best for balancing reasoning and speed.","cover_img_url":"https://shared.deepinfra.com/models/google/gemini-2.5-flash/cover_image.025fc6ab7df7bcc5d204ce13fa050cb3d950074284c97878d464951f069348b1.webp","tags":["ocr","openai","tools","reasoning","multimodal","no-free-anon","json","structured-output","cc-native"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3e-05,"cents_per_output_token":0.00025,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":1000000,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-04-17T21:54:57+00:00","private":0,"is_partner":true},{"model_name":"KoboldAI/LLaMA2-13B-Tiefighter","type":"text-generation","reported_type":"text-generation","description":"LLaMA2-13B-Tiefighter is a highly creative and versatile language model, fine-tuned for storytelling, adventure, and conversational dialogue. It combines the strengths of multiple models and datasets, including retro-rodeo and choose-your-own-adventure, to generate engaging and imaginative content. 
With its ability to improvise and adapt to different styles and formats, Tiefighter is perfect for writers, creators, and anyone looking to spark their imagination.","cover_img_url":"https://shared.deepinfra.com/models/KoboldAI/LLaMA2-13B-Tiefighter/cover_image.f8b165af2ab7e8b4232cd25996f315fc614b1f41a4a44924a0a576599138fbff.webp","tags":["openai","json"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1e-05,"cents_per_output_token":1e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":4096,"replaced_by":"Gryphe/MythoMax-L2-13b","deprecated":1732503296,"quantization":"fp16","mmlu":null,"expected":null,"create_ts":"2024-09-27T22:32:43+00:00","private":0,"is_partner":false},{"model_name":"BAAI/bge-m3","type":"embeddings","reported_type":"embeddings","description":"BGE-M3 is a versatile text embedding model that supports multi-functionality, multi-linguality, and multi-granularity, allowing it to perform dense retrieval, multi-vector retrieval, and sparse retrieval in over 100 languages and with input sizes up to 8192 tokens. The model can be used in a retrieval pipeline with hybrid retrieval and re-ranking to achieve higher accuracy and stronger generalization capabilities. 
BGE-M3 has shown state-of-the-art performance on several benchmarks, including MKQA, MLDR, and NarrativeQA, and can be used as a drop-in replacement for other embedding models like DPR and BGE-v1.5.","cover_img_url":"","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":1e-06},"max_tokens":8192,"replaced_by":null,"deprecated":null,"quantization":"fp32","mmlu":null,"expected":null,"create_ts":"2024-05-27T09:37:02+00:00","private":0,"is_partner":false},{"model_name":"NousResearch/Hermes-3-Llama-3.1-405B","type":"text-generation","reported_type":"text-generation","description":"Hermes 3 is a cutting-edge language model that offers advanced capabilities in roleplaying, reasoning, and conversation. It's a fine-tuned version of the Llama-3.1 405B foundation model, designed to align with user needs and provide powerful control. Key features include reliable function calling, structured output, generalist assistant capabilities, and improved code generation. Hermes 3 is competitive with Llama-3.1 Instruct models, with its own strengths and weaknesses.","cover_img_url":"https://shared.deepinfra.com/models/NousResearch/Hermes-3-Llama-3.1-405B/cover_image.cab5884e32bf5cb0ecd5410224ae1652a74804a027f999c319b820e596746219.webp","tags":["openai","no-free-anon","json","non-reasoning","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":0.0001,"cents_per_output_token":0.0001,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2024-10-09T19:43:35+00:00","private":0,"is_partner":false},{"model_name":"google/gemma-3-4b-it","type":"text-generation","reported_type":"text-generation","description":"Gemma 3 introduces multimodality, supporting vision-language input and text outputs. 
It handles context windows up to 128k tokens, understands over 140 languages, and offers improved math, reasoning, and chat capabilities, including structured outputs and function calling. Gemma 3-4B is Google's latest open source model, successor to Gemma 2","cover_img_url":"https://shared.deepinfra.com/models/google/gemma-3-4b-it/cover_image.48d9c8853fb7236659026bd1467cbb50c1d929aeb9cadb14d2e74f3431939d9c.webp","tags":["ocr","openai","tools","multimodal","json","non-reasoning","structured-output","cc-native","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":5e-06,"cents_per_output_token":1e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":131072,"replaced_by":null,"deprecated":null,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2025-03-28T17:59:44+00:00","private":0,"is_partner":false},{"model_name":"google/gemini-1.5-flash","type":"text-generation","reported_type":"text-generation","description":"Gemini 1.5 Flash is Google's foundation model that performs well at a variety of multimodal tasks such as visual understanding, classification, summarization, and creating content from image, audio and video. It's adept at processing visual and text inputs such as photographs, documents, infographics, and screenshots.  Gemini 1.5 Flash is designed for high-volume, high-frequency tasks where cost and latency matter. 
","cover_img_url":"https://shared.deepinfra.com/models/google/gemini-1.5-flash/cover_image.bc0b4766f6838c2ff1f9ca79b5ceb03c0350159222abac32a37ba6560e0ecc41.webp","tags":["openai","no-free-anon","multimodal","tools","cc-native","json","structured-output","ocr"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":7.5e-06,"cents_per_output_token":3e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":1000000,"replaced_by":null,"deprecated":1749069072,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-03-05T23:22:02+00:00","private":0,"is_partner":true},{"model_name":"Qwen/QVQ-72B-Preview","type":"text-generation","reported_type":"text-generation","description":"QVQ-72B-Preview is an experimental research model developed by the Qwen team, focusing on enhancing visual reasoning capabilities. QVQ-72B-Preview has achieved remarkable performance on various benchmarks. It scored a remarkable 70.3% on the Multimodal Massive Multi-task Understanding (MMMU) benchmark","cover_img_url":"https://shared.deepinfra.com/models/Qwen/QVQ-72B-Preview/cover_image.6f5a66729eafd8453844335078fd14a5a9f3d53b538a8f8c50356ee3d176975c.webp","tags":["openai","cc-native","multimodal"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2.5e-05,"cents_per_output_token":5e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":32000,"replaced_by":"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8","deprecated":1757621478,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-12-24T22:12:25+00:00","private":0,"is_partner":false},{"model_name":"MiniMaxAI/MiniMax-M2.7","type":"text-generation","reported_type":"text-generation","description":"MiniMax-M2.7 is MiniMax's first model deeply participating in its own evolution. 
M2.7 is capable of building complex agent harnesses and completing highly elaborate productivity tasks, leveraging Agent Teams, complex Skills, and dynamic tool search. ","cover_img_url":"https://shared.deepinfra.com/models/MiniMaxAI/MiniMax-M2.7/cover_image.3804eb1bb390b9eec47841c67ba9aad2e9042533435a5dc77f744bef5a843d64.webp","tags":["json","tools","reasoning","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2.5e-05,"cents_per_output_token":0.0001,"rate_per_input_token_cached":0.2,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":196608,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2026-05-15T18:13:45+00:00","private":0,"is_partner":false},{"model_name":"google/gemini-3.5-flash","type":"text-generation","reported_type":"text-generation","description":"Gemini 3.5 Flash delivers near-Pro intelligence at Flash-tier cost and speed: Pro-level coding proficiency, parallel agentic execution, all at a much lower price.","cover_img_url":"https://shared.deepinfra.com/models/google/gemini-3.5-flash/cover_image.db12bcf5eae7b129cd2d01bf80aafc0ae718c0c3688b07a218cfdc674d6b7e6b.webp","tags":["json","openai","multimodal","no-free-anon","tools","ocr","reasoning","cc-native","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":0.00015,"cents_per_output_token":0.0009,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":1000000,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-05-28T15:01:50+00:00","private":0,"is_partner":true},{"model_name":"deepseek-ai/DeepSeek-OCR","type":"text-generation","reported_type":"text-generation","description":"DeepSeek-OCR as an initial investigation into the feasibility of compressing long contexts via optical 2D mapping. 
DeepSeek-OCR consists of two components: DeepEncoder and DeepSeek3B-MoE-A570M as the decoder. Specifically, DeepEncoder serves as the core engine, designed to maintain low activations under high-resolution input while achieving high compression ratios to ensure an optimal and manageable number of vision tokens. Experiments show that when the number of text tokens is within 10 times that of vision tokens (i.e., a compression ratio < 10x), the model can achieve decoding (OCR) precision of 97%. Even at a compression ratio of 20x, the OCR accuracy still remains at about 60%. This shows considerable promise for research areas such as historical long-context compression and memory forgetting mechanisms in LLMs.","cover_img_url":"https://shared.deepinfra.com/models/deepseek-ai/DeepSeek-OCR/cover_image.7d9f0a913ae84c6ea85cfdf0f9410b7256b38f92e85c852c1bd85872f5914f0a.webp","tags":["ocr","openai","multimodal","ocr-ui","non-reasoning","cc-native"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3e-06,"cents_per_output_token":1e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":8192,"replaced_by":"google/gemma-4-31B-it","deprecated":1778122492,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2025-10-23T21:57:57+00:00","private":0,"is_partner":false},{"model_name":"NousResearch/Hermes-3-Llama-3.1-70B","type":"text-generation","reported_type":"text-generation","description":"Hermes 3 is a generalist language model with many improvements over Hermes 2, including advanced agentic capabilities, much better roleplaying, reasoning, multi-turn conversation, long context coherence, and improvements across the 
board.","cover_img_url":"https://shared.deepinfra.com/models/NousResearch/Hermes-3-Llama-3.1-70B/cover_image.99114748d1a7e2336ecc6b22173533b7fdb122f7c41d13d46c42ccb016b1ed7b.webp","tags":["openai","json","non-reasoning","structured-output","cc-native","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":7e-05,"cents_per_output_token":7e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":131072,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-06-30T21:23:24+00:00","private":0,"is_partner":false},{"model_name":"Wan-AI/Wan2.1-T2V-14B","type":"text-to-video","reported_type":"text-to-video","description":"The Wan2.1 14B model is a high-capacity, state-of-the-art video foundation model capable of producing both 480P and 720P videos. It excels at capturing complex prompts and generating visually rich, detailed scenes, making it ideal for high-end creative tasks.","cover_img_url":"https://shared.deepinfra.com/models/Wan-AI/Wan2.1-T2V-14B/cover_image.c6484a6ef9d6ee653bc20d668f37b40131720035cf7b7dcf064392859fc792ad.webp","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":40.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":"Wan-AI/Wan2.2-T2V-A14B","deprecated":1775771311,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-03-13T14:11:43+00:00","private":0,"is_partner":false},{"model_name":"Bria/video_eraser","type":"text-to-video","reported_type":"text-to-video","description":"Remove unwanted objects or regions from video using a mask, reconstructs the background with intelligent content-aware 
fill.","cover_img_url":"https://shared.deepinfra.com/models/Bria/video_eraser/cover_image.3dc38dab9bfd7440e7d2a14a2b4b3666852b2622a2e0676e314c2356756b5948.webp","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"output_length","cents_per_output_sec":14.0},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-01-28T18:06:34+00:00","private":0,"is_partner":true},{"model_name":"deepseek-ai/DeepSeek-R1-0528-Turbo","type":"text-generation","reported_type":"text-generation","description":"The DeepSeek R1 0528 turbo model is a state of the art reasoning model that can generate very quick responses","cover_img_url":"https://shared.deepinfra.com/models/deepseek-ai/DeepSeek-R1-0528-Turbo/cover_image.c0d90adf386ad20a13ef135dd16a5381140583dece7eaf22b7271385c6e8769f.webp","tags":["openai","tools","reasoning","b200","json","non-reasoning"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":0.0001,"cents_per_output_token":0.0003,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":32768,"replaced_by":"deepseek-ai/DeepSeek-R1-0528","deprecated":1779400267,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2025-06-16T22:50:52+00:00","private":0,"is_partner":false},{"model_name":"Bria/fibo","type":"text-to-image","reported_type":"text-to-image","description":"FIBO is an open-source, JSON-native text-to-image model trained on detailed structured descriptions (over 1,000+ words per image), providing fine-grained control over light, composition, and camera 
parameters.","cover_img_url":"https://shared.deepinfra.com/models/Bria/fibo/cover_image.0eec2871473637a613acedeb8ca225555cad8885603c72a1fe2ca551ef61497b.webp","tags":["no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":4.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-11-05T16:22:25+00:00","private":0,"is_partner":true},{"model_name":"Bria/replace_background","type":"text-to-image","reported_type":"text-to-image","description":"Bria Background Generation allows for efficient swapping of backgrounds in images via text prompts or reference image, delivering realistic and polished results. Trained exclusively on licensed data for safe and risk-free commercial use.","cover_img_url":"https://shared.deepinfra.com/models/Bria/replace_background/cover_image.457c7b12b04530ea6b07713c1cfc4edc02e06f3b2d9dd1490c9bbe0f5b766c74.webp","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":4.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-09-01T13:41:44+00:00","private":0,"is_partner":true},{"model_name":"sentence-transformers/all-mpnet-base-v2","type":"embeddings","reported_type":"embeddings","description":"A sentence transformation model that has been trained on a wide range of datasets, including but not limited to S2ORC, WikiAnswers, PAQ, Stack Exchange, and Yahoo! Answers. 
Our model can be used for various NLP tasks such as clustering, sentiment analysis, and question answering.","cover_img_url":"","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":5e-07},"max_tokens":512,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-02-28T17:00:59+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-VL-235B-A22B-Instruct","type":"text-generation","reported_type":"text-generation","description":"Meet Qwen3-VL — the most powerful vision-language model in the Qwen series to date.  This generation delivers comprehensive upgrades across the board: superior text understanding & generation, deeper visual perception & reasoning, extended context length, enhanced spatial and video dynamics comprehension, and stronger agent interaction capabilities.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-VL-235B-A22B-Instruct/cover_image.188ae740a50d6f2f48bed81f834ffbe2fe5a0f38bd96dbd312ef804c18aad0c4.webp","tags":["ocr","openai","tools","multimodal","json","non-reasoning","structured-output","cc-native","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2e-05,"cents_per_output_token":8.8e-05,"rate_per_input_token_cached":0.55,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-10-14T21:31:14+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3.6-27B","type":"text-generation","reported_type":"text-generation","description":"","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3.6-27B/cover_image.92926bd9d200e5fdc4a12c0628cb5705fa45a1e1dc1c953768f53ef8f078ec6e.webp","tags":["input-video","json","multimodal","reasoning","tools","cc-native","structured-output","openai","priority"],"pricing":{"short":null,"f
ull":null,"table":null,"type":"tokens","cents_per_input_token":3.2e-05,"cents_per_output_token":0.00032,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2026-04-30T11:01:07+00:00","private":0,"is_partner":false},{"model_name":"meta-llama/Meta-Llama-3-70B-Instruct","type":"text-generation","reported_type":"text-generation","description":"Model Details Meta developed and released the Meta Llama 3 family of large language models (LLMs), a collection of pretrained and instruction tuned generative text models in 8 and 70B sizes.","cover_img_url":"https://shared.deepinfra.com/models/meta-llama/Meta-Llama-3-70B-Instruct/cover_image.bcffae761540e7dd36aea32e2a576690d43592a0fc39b9edbe83a5420758aabf.webp","tags":["openai","tools","json"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3e-05,"cents_per_output_token":4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":8192,"replaced_by":"meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo","deprecated":1757621725,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-04-18T23:05:24+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3.5-27B","type":"text-generation","reported_type":"text-generation","description":"Qwen3.5-27B is Alibaba's largest dense Qwen3.5 model, delivering near-frontier quality across reasoning, coding, and instruction following. It features a 262K token context window (extensible to 1M), thinking/reasoning mode, tool calling, multi-token prediction, and support for 201 languages. 
Best suited for production deployments and complex enterprise tasks requiring top-tier performance.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3.5-27B/cover_image.e7b88fccc8aba7f2a7626bbe6143f7eb4a844a635a2e97982ab7dcb35ad4138a.webp","tags":["structured-output","tools","json","reasoning","openai","cc-native","multimodal","input-video","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2.6e-05,"cents_per_output_token":0.00026,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2026-03-24T00:50:54+00:00","private":0,"is_partner":false},{"model_name":"ACE-Step/acestep-v15-xl-sft","type":"text-to-music","reported_type":"text-to-music","description":"ACE-Step v1.5 is a powerful open-source music foundation model that turns a text prompt into a complete song — vocals, lyrics, and instrumentation — at quality that rivals commercial tools. 
We run the high-quality XL checkpoint with its planning step  (\"thinking\") on by default, so generations favor musical structure and coherence over raw speed.","cover_img_url":"https://shared.deepinfra.com/models/ACE-Step/acestep-v15-xl-sft/cover_image.e3cb8892d2024f76bb9245293409dae8e22b6684447b937458386cce64294e35.webp","tags":[],"pricing":{"short":"$0.001 / second of audio","full":"$0.001 per second of generated music","table":null,"type":"output_length","cents_per_output_sec":0.1},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-06-08T23:43:10+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3.6-35B-A3B","type":"text-generation","reported_type":"text-generation","description":"Qwen3.6-35B-A3B is Alibaba's latest flagship Mixture-of-Experts model, with 35B total parameters and only 3B activated per token (256 experts, 8 routed + 1 shared). Built on direct feedback from the community, Qwen3.6 prioritizes stability and real-world utility, offering developers a more intuitive, responsive, and genuinely productive coding experience.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3.6-35B-A3B/cover_image.3200195ee9698ed432d45ce24936ebe99803d89bcdfed0dfd93d1ec8eece59ae.webp","tags":["structured-output","tools","json","reasoning","openai","cc-native","multimodal","input-video","priority","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.5e-05,"cents_per_output_token":9.5e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2026-04-18T03:46:06+00:00","private":0,"is_partner":false},{"model_name":"inworld-ai/inworld-tts-1.5-mini","type":"text-to-speech","reported_type":"text-to-speech","description":"Fast multilingual text-to-speech model 
by Inworld AI with 130+ preset voices across 15 languages. Supports voice cloning, word-level timestamps, and streaming. Optimized for low-latency applications with <130ms time-to-first-audio.","cover_img_url":"https://shared.deepinfra.com/models/inworld-ai/inworld-tts-1.5-mini/cover_image.4949ffe99638b5877cd72f411b5d9603d5b3b084fecad8cd72a0b4f5e63125d3.webp","tags":["voice","no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"input_character_length","cents_per_input_chars":0.0025},"max_tokens":null,"replaced_by":"inworld-ai/realtime-tts-1.5-mini","deprecated":1778020031,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-03-13T00:07:47+00:00","private":0,"is_partner":true},{"model_name":"cognitivecomputations/dolphin-2.6-mixtral-8x7b","type":"text-generation","reported_type":"text-generation","description":"The Dolphin 2.6 Mixtral 8x7b model is a finetuned version of the Mixtral-8x7b model, trained on a variety of data including coding data, for 3 days on 4 A100 GPUs. It is uncensored and requires trust_remote_code. The model is very obedient and good at coding, but not DPO tuned. The dataset has been filtered for alignment and bias. 
The model is compliant with user requests and can be used for various purposes such as generating code or engaging in general chat.","cover_img_url":"https://shared.deepinfra.com/models/cognitivecomputations/dolphin-2.6-mixtral-8x7b/cover_image.b265207e1a422c62c06f23a86e6ef6e8ee326de40a24bb1c5d9f102c1f2acd6b.webp","tags":["openai","json"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2.4e-05,"cents_per_output_token":2.4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":32768,"replaced_by":"Qwen/Qwen2.5-72B-Instruct","deprecated":1728579753,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-01-02T16:19:09+00:00","private":0,"is_partner":false},{"model_name":"run-diffusion/Juggernaut-Flux","type":"text-to-image","reported_type":"text-to-image","description":"A drop-in replacement for Flux [Dev] that delivers sharper details, richer colors, and enhanced realism, while instantly boosting LoRAs and LyCORIS with full compatibility.","cover_img_url":"https://shared.deepinfra.com/models/run-diffusion/Juggernaut-Flux/cover_image.c7c1e14f16e7a5071b6c3ac0a8f5d1cbade2a62894d71cd64007568593561ff2.webp","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":0.9,"default_width":1024,"default_height":1024,"default_iterations":25,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":"black-forest-labs/FLUX-1-dev","deprecated":1751283332,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-03-18T12:09:32+00:00","private":0,"is_partner":false},{"model_name":"Bria/expand","type":"text-to-image","reported_type":"text-to-image","description":"Bria Expand expands images beyond their borders in high quality. Resizing the image by generating new pixels to expand to the desired aspect ratio. 
Trained exclusively on licensed data for safe and risk-free commercial use.","cover_img_url":"https://shared.deepinfra.com/models/Bria/expand/cover_image.911cc51de6d8ca73a2e2644cb3ee3de48de2dab28ca559ba17e4c306f83b1dae.webp","tags":["no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":4.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-09-01T10:02:57+00:00","private":0,"is_partner":true},{"model_name":"Wan-AI/Wan2.6-T2V","type":"text-to-video","reported_type":"text-to-video","description":"Turn any prompt into a smooth video. Intelligent shot scheduling supports multi-shot storytelling, generating multi-shot narrative videos with consistent subjects, scenes, and atmosphere","cover_img_url":"https://shared.deepinfra.com/models/Wan-AI/Wan2.6-T2V/cover_image.c49881bbfb3cab02ce7ce68c1de480c2994f8d95bfe781b473a9f701a7001232.webp","tags":["no-free-anon","openai"],"pricing":{"short":"$0.10 / second","full":"$0.15 / second for 1080P, $0.10 / second for 720P","table":{"columns":["resolution","$ cost per second"],"rows":[["720P","$0.10"],["1080P","$0.15"]]},"type":"output_length","cents_per_output_sec":10.0},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-02-05T16:19:47+00:00","private":0,"is_partner":true},{"model_name":"Qwen/Qwen2.5-Coder-7B","type":"text-generation","reported_type":"text-generation","description":"Qwen2.5-Coder-7B is a powerful code-specific large language model with 7.61 billion parameters. It's designed for code generation, reasoning, and fixing tasks. 
The model covers 92 programming languages and has been trained on 5.5 trillion tokens of data, including source code, text-code grounding, and synthetic data.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen2.5-Coder-7B/cover_image.886b8a41543e9a43e8580a96ed18e0ec309b466ca6d744c1900e5ecffddc26f9.webp","tags":["openai","completion"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2.5e-06,"cents_per_output_token":5e-06,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":32768,"replaced_by":"Qwen/Qwen3-14B","deprecated":1762392581,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2024-09-20T21:37:50+00:00","private":0,"is_partner":false},{"model_name":"BAAI/bge-en-icl","type":"embeddings","reported_type":"embeddings","description":"A LLM-based embedding model with in-context learning capabilities that achieves SOTA performance on BEIR and AIR-Bench. It leverages few-shot examples to enhance task performance.","cover_img_url":"","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":1e-06},"max_tokens":8192,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-01-31T13:17:09+00:00","private":0,"is_partner":false},{"model_name":"FastVideo/FastWan-QAD-FP8-1.3B","type":"text-to-video","reported_type":"text-to-video","description":"A fast, compact 480p text-to-video model — 5-second clips (landscape or portrait) from a text prompt. 
A 3-step, FP8 quantization-aware distillation of Wan2.1-T2V-1.3B by FastVideo (Hao AI Lab).","cover_img_url":"https://shared.deepinfra.com/models/FastVideo/FastWan-QAD-FP8-1.3B/cover_image.94fa25fd2ff2bad491428d2849424bc93c8640c97a477a86957265cc6ec5c21d.webp","tags":[],"pricing":{"short":"$0.0025 / second (480p)","full":null,"table":null,"type":"output_length","cents_per_output_sec":0.25},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-06-27T00:57:01+00:00","private":0,"is_partner":false},{"model_name":"nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL","type":"text-generation","reported_type":"text-generation","description":"NVIDIA Nemotron 2 Nano VL extends the Nemotron family into multi-modal reasoning and document intelligence. This auto-regressive vision-language model enables multi-image reasoning, video understanding, visual Q&A and document analysis and summarization. Optimized for enterprise AI workflows, it powers multimodal agentic systems such as visual copilots, document assistants, and knowledge automation pipelines.","cover_img_url":"https://shared.deepinfra.com/models/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL/cover_image.158b8bcf1b41d1bc19f4df870194fefbc65d75dc14a8cc611d225711dc1994dd.webp","tags":["ocr","openai","reasoning","multimodal","cc-native"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2e-05,"cents_per_output_token":6e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning","deprecated":1778120330,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-10-28T19:24:26+00:00","private":0,"is_partner":false},{"model_name":"Wan-AI/Wan2.6-Image-Edit","type":"text-to-image","reported_type":"text-to-image","description":"An all-round image generation model that supports joint text–image reasoning, 
multi-image creative fusion, commercial-grade consistency, aesthetic style transfer, and precise control of framing and lighting, significantly enhancing consistency, controllability, and expressiveness in image generation.","cover_img_url":"https://shared.deepinfra.com/models/Wan-AI/Wan2.6-Image-Edit/cover_image.4e3c85c253a6dce4d0ca542cdbfcba3ac0281917f724d0b27502c7e9c5283ede.webp","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":3.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-02-18T12:43:32+00:00","private":0,"is_partner":true},{"model_name":"black-forest-labs/FLUX-2-klein-4b","type":"text-to-image","reported_type":"text-to-image","description":"The fastest model of the Flux 2 family. Frontier visual intelligence — state-of-the-art image generation and editing from Black Forest Labs","cover_img_url":"https://shared.deepinfra.com/models/black-forest-labs/FLUX-2-klein-4b/cover_image.8508e1fb8e6c82d94849eab2961498d340382de240f592f7c5514d0131961347.webp","tags":["openai","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":1.4,"default_width":1024,"default_height":1024,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-01-20T15:37:50+00:00","private":0,"is_partner":false},{"model_name":"Bria/Bria-3.2-vector","type":"text-to-image","reported_type":"text-to-image","description":"Bria 3.2 is the next-generation commercial-ready text-to-image model. 
With just 4 billion parameters, it provides exceptional aesthetics and text rendering, evaluated to be on par with leading open-source models, and outperforming other licensed models.","cover_img_url":"https://shared.deepinfra.com/models/Bria/Bria-3.2-vector/cover_image.dd10c4ee7c41807e7ac37c88f33c7f8f422553f76f09374e3ce2dcd7daa2fa57.webp","tags":["no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":4.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-09-11T12:52:20+00:00","private":0,"is_partner":true},{"model_name":"ByteDance/Seedream-4.5","type":"text-to-image","reported_type":"text-to-image","description":"The latest image model, delivering better editing consistency, improved multi-image fusion, finer detail control, natural small text and faces, and harmonious, aesthetic visuals.","cover_img_url":"https://shared.deepinfra.com/models/ByteDance/Seedream-4.5/cover_image.633808cbc1fec2c06d29c16089ae8613e95459cd5e73f9ce9bb63d728315182c.webp","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":4.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-02-16T14:36:14+00:00","private":0,"is_partner":true},{"model_name":"black-forest-labs/FLUX-2-pro","type":"text-to-image","reported_type":"text-to-image","description":"Multi-reference visual intelligence with unprecedented detail, color precision, and spatial reasoning.  The most advanced image generation and editing model. 
Generate photorealistic images with precise control.","cover_img_url":"https://shared.deepinfra.com/models/black-forest-labs/FLUX-2-pro/cover_image.507c610a18708c5b7f04c1eb00fc961fcbccf17e9b13db0e88102cb144cb1ec8.webp","tags":["no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":1.5,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":true},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-11-28T14:50:34+00:00","private":0,"is_partner":true},{"model_name":"Phind/Phind-CodeLlama-34B-v2","type":"text-generation","reported_type":"text-generation","description":"Phind-CodeLlama-34B-v2 is an open-source language model that has been fine-tuned on 1.5B tokens of high-quality programming-related data and achieved a pass@1 rate of 73.8% on HumanEval. It is multi-lingual and proficient in Python, C/C++, TypeScript, Java, and more. It has been trained on a proprietary dataset of instruction-answer pairs instead of code completion examples.  The model is instruction-tuned on the Alpaca/Vicuna format to be steerable and easy-to-use. 
It accepts the Alpaca/Vicuna instruction format and can generate one completion for each prompt.","cover_img_url":"https://shared.deepinfra.com/models/Phind/Phind-CodeLlama-34B-v2/cover_image.0d7cb500b84d00e46b7bf490b75cf8eda73a3ad775fa4360c8deba541c3349b3.webp","tags":["openai","json"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":6e-05,"cents_per_output_token":6e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":4096,"replaced_by":"meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo","deprecated":1781217521,"quantization":"fp16","mmlu":null,"expected":null,"create_ts":"2023-11-20T16:36:30+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3.7-Max","type":"text-generation","reported_type":"text-generation","description":"The largest and most capable in the Qwen3.7 series. Qwen3.7 is a next‑generation flagship model designed for the agent‑centric.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3.7-Max/cover_image.ca11accb7c4e1cd630f411727764a17290246e5c6e7be1ec26cecd6b98427818.webp","tags":["no-free-anon","json","cc-native","non-reasoning","structured-output","tools","openai"],"pricing":{"short":"$2.50 in $7.50 out $0.50 cached / 1M tokens","full":"$2.50 in $7.50 out $0.50 cached <= 32K, $5.0 in $15 out $1.0 cached <= 128K, $6.25 in $18.50 out $1.25 cached > 128K","table":null,"type":"tokens","cents_per_input_token":0.00025,"cents_per_output_token":0.00075,"rate_per_input_token_cached":0.2,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":256000,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-05-28T14:30:21+00:00","private":0,"is_partner":true},{"model_name":"deepseek-ai/Janus-Pro-1B","type":"text-to-image","reported_type":"text-to-image","description":"Janus-Pro is a novel autoregressive framework that unifies multimodal 
understanding and generation. It addresses the limitations of previous approaches by decoupling visual encoding into separate pathways, while still utilizing a single, unified transformer architecture for processing. The decoupling not only alleviates the conflict between the visual encoder’s roles in understanding and generation, but also enhances the framework’s flexibility. Janus-Pro surpasses previous unified models and matches or exceeds the performance of task-specific models. The simplicity, high flexibility, and effectiveness of Janus-Pro make it a strong candidate for next-generation unified multimodal models.","cover_img_url":"https://shared.deepinfra.com/models/deepseek-ai/Janus-Pro-1B/cover_image.9040348a9ed9f973faa369ded0ee524f7a09c93d28a6487d2616f05d3a62d84b.webp","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":0.05,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-02-11T11:40:00+00:00","private":0,"is_partner":false},{"model_name":"zai-org/GLM-4.6V","type":"text-generation","reported_type":"text-generation","description":"This model is part of the GLM-V family of models, introduced in the paper GLM-4.1V-Thinking and GLM-4.5V: Towards Versatile Multimodal Reasoning with Scalable Reinforcement 
Learning.","cover_img_url":"https://shared.deepinfra.com/models/zai-org/GLM-4.6V/cover_image.9d5fb1cfbe62bc3893fc21a491e6f563c8c2efa15becca3b86fa731331a9b391.webp","tags":["openai","tools","reasoning","multimodal","can-disable-reasoning","json","structured-output","cc-native"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3e-05,"cents_per_output_token":9e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":"google/gemma-4-26B-A4B-it","deprecated":1777582426,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-12-08T22:26:17+00:00","private":0,"is_partner":false},{"model_name":"mistralai/Mistral-7B-Instruct-v0.2","type":"text-generation","reported_type":"text-generation","description":"The Mistral-7B-Instruct-v0.2 Large Language Model (LLM) is an instruct fine-tuned version of the Mistral-7B-v0.2 generative text model using a variety of publicly available conversation datasets.","cover_img_url":"https://shared.deepinfra.com/models/mistralai/Mistral-7B-Instruct-v0.2/cover_image.429fef8a2a09e5c4104ede511db12beaea9d3917c4754d709b05a65d5d6f6c1f.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":5.5e-06,"cents_per_output_token":5.5e-06,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":32768,"replaced_by":"mistralai/Mistral-Small-3.2-24B-Instruct-2506","deprecated":1757952363,"quantization":"fp16","mmlu":null,"expected":null,"create_ts":"2024-03-28T00:06:32+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-TTS-VoiceDesign","type":"text-to-speech","reported_type":"text-to-speech","description":"● Qwen3-TTS-VoiceDesign is a voice design variant of Qwen3-TTS by Alibaba's Qwen team. 
Instead of selecting from preset voices, you describe the voice you want in natural language — and the model generates speech in that voice.                                                                                                                                                                                                                                                                     Key capabilities:                                                                                                                                                                                                                                  - Natural language voice control — describe any voice with free text (e.g. \"a deep male voice with a calm, authoritative presence\", \"a young cheerful female with a warm and friendly tone\")   - 10 languages — English, Chinese, Japanese, Korean, German, French, Russian, Spanish, Italian, Portuguese                                                                                                                                         - Streaming support — real-time PCM streaming   - Multiple output formats — WAV, MP3, FLAC, PCM    Built on the same 1.7B parameter architecture as Qwen3-TTS, using discrete multi-codebook language modeling and a custom 12Hz acoustic tokenizer for high-quality end-to-end speech synthesis.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-TTS-VoiceDesign/cover_image.610c07ba62315fdf538bea67abfe252be6559e0bf3050d6913b5f886679a68e7.webp","tags":["openai","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"input_character_length","cents_per_input_chars":0.002},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-03-06T23:07:14+00:00","private":0,"is_partner":false},{"model_name":"openai/gpt-oss-120b","type":"text-generation","reported_type":"text-generation","description":"gpt-oss-120b is an open-weight, 
117B-parameter Mixture-of-Experts (MoE) language model from OpenAI designed for high-reasoning, agentic, and general-purpose production use cases. The model supports configurable reasoning depth, full chain-of-thought access, and native tool use, including function calling, browsing, and structured output generation.","cover_img_url":"https://shared.deepinfra.com/models/openai/gpt-oss-120b/cover_image.a422aaee1b1d6446e2ccb86489f79eb77a2143a4d2eef7f4e32a314679cc204a.webp","tags":["openai","tools","reasoning","structured-output","json","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3.9e-06,"cents_per_output_token":1.7e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":131072,"replaced_by":null,"deprecated":null,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2025-08-05T21:57:08+00:00","private":0,"is_partner":false},{"model_name":"BAAI/bge-large-en-v1.5","type":"embeddings","reported_type":"embeddings","description":"BGE embedding is a general Embedding Model. It is pre-trained using retromae and trained on large-scale pair data using contrastive learning. Note that the goal of pre-training is to reconstruct the text, and the pre-trained model cannot be used for similarity calculation directly, it needs to be fine-tuned","cover_img_url":"","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":1e-06},"max_tokens":512,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-11-22T19:31:18+00:00","private":0,"is_partner":false},{"model_name":"nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B","type":"text-generation","reported_type":"text-generation","description":"Nemotron 3 Ultra is built for frontier reasoning, orchestration, coding agents, deep research, and complex enterprise workflows. 
It delivers up to 5x faster inference and up to 30% lower cost for agentic workloads while supporting up to 1M token context.","cover_img_url":"https://shared.deepinfra.com/models/nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B/cover_image.1d5dd34784ef0e685621b29619e36e0043ed9532a4740293fb9131a482315a5b.webp","tags":["openai","multimodal","reasoning","tools","json","structured-output","priority","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":5e-05,"cents_per_output_token":0.00022,"rate_per_input_token_cached":0.2,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2026-05-16T00:11:05+00:00","private":0,"is_partner":false},{"model_name":"deepseek-ai/DeepSeek-V3.2","type":"text-generation","reported_type":"text-generation","description":"DeepSeek-V3.2 is a large language model designed to harmonize high computational efficiency with strong reasoning and agentic tool-use performance. It introduces DeepSeek Sparse Attention (DSA), a fine-grained sparse attention mechanism that reduces training and inference cost while preserving quality in long-context scenarios. A scalable reinforcement learning post-training framework further improves reasoning, with reported performance in the GPT-5 class, and the model has demonstrated gold-medal results on the 2025 IMO and IOI. 
V3.2 also uses a large-scale agentic task synthesis pipeline to better integrate reasoning into tool-use settings, boosting compliance and generalization in interactive environments.","cover_img_url":"https://shared.deepinfra.com/models/deepseek-ai/DeepSeek-V3.2/cover_image.600309da9c26caecc8dd1c1b2cdfe869d5e0cb2d5f19bb6f6325b6e59e358509.webp","tags":["openai","tools","can-disable-reasoning","json","non-reasoning","structured-output","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2.6e-05,"cents_per_output_token":3.8e-05,"rate_per_input_token_cached":0.5,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":163840,"replaced_by":null,"deprecated":null,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2025-12-02T20:21:16+00:00","private":0,"is_partner":false},{"model_name":"openai/whisper-base","type":"automatic-speech-recognition","reported_type":"automatic-speech-recognition","description":"Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation. It was trained on 680k hours of labelled data and demonstrates a strong ability to generalize to many datasets and domains without fine-tuning.  The model is based on a Transformer encoder-decoder architecture.  
Whisper models are available for various languages including English, Spanish, French, German, Italian, Portuguese, Russian, Chinese, Japanese, Korean, and many more.","cover_img_url":"","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"time","cents_per_sec":0.05},"max_tokens":null,"replaced_by":"openai/whisper-large-v3","deprecated":1722034220,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-02-15T19:58:34+00:00","private":0,"is_partner":false},{"model_name":"openchat/openchat_3.5","type":"text-generation","reported_type":"text-generation","description":"OpenChat is a library of open-source language models that have been fine-tuned with C-RLFT, a strategy inspired by offline reinforcement learning. These models can learn from mixed-quality data without preference labels and have achieved exceptional performance comparable to ChatGPT. The developers of OpenChat are dedicated to creating a high-performance, commercially viable, open-source large language model and are continuously making progress towards this goal.","cover_img_url":"https://shared.deepinfra.com/models/openchat/openchat_3.5/cover_image.6112a8e07a704c30bd7c354351fa79c13904d9df7667a0064fb6b30bc80e728b.webp","tags":["openai","json"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":5.5e-06,"cents_per_output_token":5.5e-06,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":8192,"replaced_by":"meta-llama/Meta-Llama-3.1-8B-Instruct","deprecated":1743551004,"quantization":"fp16","mmlu":null,"expected":null,"create_ts":"2023-11-20T11:34:04+00:00","private":0,"is_partner":false},{"model_name":"deepseek-ai/DeepSeek-V3-0324","type":"text-generation","reported_type":"text-generation","description":"DeepSeek-V3-0324, a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token, an improved iteration over 
DeepSeek-V3.","cover_img_url":"https://shared.deepinfra.com/models/deepseek-ai/DeepSeek-V3-0324/cover_image.2132971065034c302886aaa95cfd9661a96989fbe12ec2d93e78ac4c831bf66d.webp","tags":["openai","tools","b200","json","non-reasoning","structured-output","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2.4e-05,"cents_per_output_token":9e-05,"rate_per_input_token_cached":0.5625,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":163840,"replaced_by":null,"deprecated":null,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2025-03-24T19:47:05+00:00","private":0,"is_partner":false},{"model_name":"mistralai/Devstral-Small-2505","type":"text-generation","reported_type":"text-generation","description":"Devstral is an agentic LLM for software engineering tasks. Devstral excels at using tools to explore codebases, editing multiple files and power software engineering agents. ","cover_img_url":"https://shared.deepinfra.com/models/mistralai/Devstral-Small-2505/cover_image.7f4d4eef714d466c1287bec36616939a0d8f255b1ed36d80b17410eae66bdbaf.webp","tags":["openai","cc-native","tools"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":6e-06,"cents_per_output_token":1.2e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":128000,"replaced_by":"mistralai/Mistral-Small-3.2-24B-Instruct-2506","deprecated":1757623863,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2025-05-28T21:52:41+00:00","private":0,"is_partner":false},{"model_name":"Wan-AI/Wan2.6-I2V","type":"text-to-video","reported_type":"text-to-video","description":"Turn any image into a video. 
Intelligent shot scheduling supports multi-shot storytelling, generating multi-shot narrative videos with consistent subjects, scenes, and atmosphere","cover_img_url":"https://shared.deepinfra.com/models/Wan-AI/Wan2.6-I2V/cover_image.e4ec27e3c7e10637b3c511bbc6720a9527912369b0943a893439f78fae703af9.webp","tags":["no-free-anon"],"pricing":{"short":"$0.10 / second","full":"$0.15 / second for 1080P, $0.10 / second for 720P","table":{"columns":["resolution","$ cost per second"],"rows":[["720P","$0.10"],["1080P","$0.15"]]},"type":"output_length","cents_per_output_sec":10.0},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-02-04T17:13:35+00:00","private":0,"is_partner":true},{"model_name":"ByteDance/Seedance-2.0","type":"text-to-video","reported_type":"text-to-video","description":"A new-generation professional-grade multimodal video creation model developed, supports video generation with multimodal reference inputs including images, videos and audio.","cover_img_url":"https://shared.deepinfra.com/models/ByteDance/Seedance-2.0/cover_image.501370619f0554dc4c09a5ddc365f0a022ad2b4aab2f1f41bb491059764fb217.webp","tags":["no-free-anon","openai"],"pricing":{"short":null,"full":"$4.3/M with video, $7/M without for 480p and 780p; $4.7/M with video, $7.7/M without for 1080p","table":null,"type":"input_tokens","cents_per_input_token":0.00043},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-04-29T11:56:41+00:00","private":0,"is_partner":true},{"model_name":"Bria/blur_background","type":"text-to-image","reported_type":"text-to-image","description":"Bria Blur Background softens and de-emphasizes image backgrounds while keeping the subject sharp and clear for professional-quality results. 
Trained fully on licensed data, it delivers safe, natural, and commercial-ready outputs.","cover_img_url":"https://shared.deepinfra.com/models/Bria/blur_background/cover_image.d212a8501879df8cb3b427f5bc0f1611693b4fdd9927226dda44057aec909095.webp","tags":["no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":4.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-09-01T13:41:49+00:00","private":0,"is_partner":true},{"model_name":"Qwen/Qwen3-Embedding-8B-batch","type":"embeddings","reported_type":"embeddings","description":"The Qwen3 Embedding model series is the latest proprietary model of the Qwen family, specifically designed for text embedding and ranking tasks. Building upon the dense foundational models of the Qwen3 series, it provides a comprehensive range of text embeddings and reranking models in various sizes (0.6B, 4B, and 8B).","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-Embedding-8B-batch/cover_image.0d97a5a6c8c888a5d0de7fddbe265531552c2c852759bdde17ecede2618d4a25.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":4e-06},"max_tokens":32768,"replaced_by":"Qwen/Qwen3-Embedding-8B","deprecated":1783352672,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-10-30T12:26:08+00:00","private":0,"is_partner":false},{"model_name":"allenai/olmOCR-7B-1025","type":"text-generation","reported_type":"text-generation","description":"olmOCR is a specialized AI tool that converts PDF documents into clean, structured text while preserving important formatting and layout information. 
What makes olmOCR particularly valuable for developers is its ability to handle challenging PDFs that traditional OCR tools struggle with—including complex layouts, poor-quality scans, handwritten text, and documents with mixed content types. Built on a fine-tuned 7B vision-language model, olmOCR provides enterprise-grade PDF processing at a fraction of the cost of proprietary solutions.","cover_img_url":"https://shared.deepinfra.com/models/allenai/olmOCR-7B-1025/cover_image.d799a8c20e5d9d447bf175e60095197761ced1e47e4e3731b0b3d58e25be0256.webp","tags":["openai","multimodal","cc-native","json","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.4e-05,"cents_per_output_token":8e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":16384,"replaced_by":"google/gemma-4-31B-it","deprecated":1778120282,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-10-17T23:33:57+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-235B-A22B-Thinking-2507","type":"text-generation","reported_type":"text-generation","description":"Qwen3-235B-A22B-Thinking-2507 is the Qwen3's new model with scaling the thinking capability of Qwen3-235B-A22B, improving both the quality and depth of reasoning. 
","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-235B-A22B-Thinking-2507/cover_image.2b87c6645946388f890b18b46894e32d140913d73154eb6ae428e378916853c3.webp","tags":["openai","tools","json","reasoning","structured-output","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2.3e-05,"cents_per_output_token":0.00023,"rate_per_input_token_cached":0.86956522,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-07-25T18:22:07+00:00","private":0,"is_partner":false},{"model_name":"google/gemma-2-27b-it","type":"text-generation","reported_type":"text-generation","description":"Gemma is a family of lightweight, state-of-the-art open models from Google. Gemma-2-27B delivers the best performance for its size class, and even offers competitive alternatives to models more than twice its size. ","cover_img_url":"https://shared.deepinfra.com/models/google/gemma-2-27b-it/cover_image.7278882b03ca6732cf6374aaab42734a2ede6343b1b89421f15ad0fc27e3936c.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2.7e-05,"cents_per_output_token":2.7e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":8192,"replaced_by":"google/gemma-3-27b-it","deprecated":1743551250,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-07-13T00:24:26+00:00","private":0,"is_partner":false},{"model_name":"google/gemma-4-31B-it","type":"text-generation","reported_type":"text-generation","description":"Gemma is a family of open models built by Google DeepMind. 
Gemma 4 models are multimodal, handling text and image input and generating text output.","cover_img_url":"https://shared.deepinfra.com/models/google/gemma-4-31B-it/cover_image.0b8bfa626859a2b86c0f60c5bcb5847521cbd34a013b8f485b59e80f3a100836.webp","tags":["multimodal","openai","tools","cc-native","structured-output","json","reasoning","can-disable-reasoning","input-audio","input-video","priority","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.3e-05,"cents_per_output_token":3.8e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2026-04-07T22:59:57+00:00","private":0,"is_partner":false},{"model_name":"MiniMaxAI/MiniMax-M3","type":"text-generation","reported_type":"text-generation","description":"MiniMax-M3 is a native multimodal model with 1M context. It has ~428B parameters and ~23B activated parameters.","cover_img_url":"https://shared.deepinfra.com/models/MiniMaxAI/MiniMax-M3/cover_image.811cbc3e080f92da03204dc6921c5d48b7b9b3cedf5fdf85f1228a9b1a6161c5.webp","tags":["openai","cc-native","multimodal","tools","reasoning","json","input-video"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3e-05,"cents_per_output_token":0.00012,"rate_per_input_token_cached":0.2,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":524288,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-07-03T16:10:49+00:00","private":0,"is_partner":false},{"model_name":"zai-org/GLM-5","type":"text-generation","reported_type":"text-generation","description":"GLM-5 is an advanced, open-source large language model designed for developers tackling the toughest challenges. 
It excels at long-context reasoning, multi-step tool orchestration, and complex systems engineering, making it the ideal choice for powering sophisticated agents and applications that require high-level cognitive tasks.","cover_img_url":"https://shared.deepinfra.com/models/zai-org/GLM-5/cover_image.78a934a7d86ef8500dd88ce7a7bb958032b7fc881ea41b2ac94ad4caa30d2693.webp","tags":["openai","reasoning","can-disable-reasoning","tools","json","structured-output","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":6e-05,"cents_per_output_token":0.000208,"rate_per_input_token_cached":0.2,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":202752,"replaced_by":null,"deprecated":null,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2026-02-11T18:08:58+00:00","private":0,"is_partner":false},{"model_name":"openai/gpt-oss-20b","type":"text-generation","reported_type":"text-generation","description":"gpt-oss-20b is an open-weight 21B parameter model released by OpenAI under the Apache 2.0 license. It uses a Mixture-of-Experts (MoE) architecture with 3.6B active parameters per forward pass, optimized for lower-latency inference. 
The model is trained in OpenAI’s Harmony response format and supports reasoning level configuration, fine-tuning, and agentic capabilities including function calling, tool use, and structured outputs.","cover_img_url":"https://shared.deepinfra.com/models/openai/gpt-oss-20b/cover_image.4841ba1e78220022449bec081f219edc092a81653a7a32e8984ddec2d7ad0f3d.webp","tags":["openai","tools","reasoning","json","structured-output","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3e-06,"cents_per_output_token":1.4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":131072,"replaced_by":null,"deprecated":null,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2025-08-06T00:56:44+00:00","private":0,"is_partner":false},{"model_name":"allenai/olmOCR-7B-0725-FP8","type":"text-generation","reported_type":"text-generation","description":"olmOCR is a specialized AI tool that converts PDF documents into clean, structured text while preserving important formatting and layout information. What makes olmOCR particularly valuable for developers is its ability to handle challenging PDFs that traditional OCR tools struggle with—including complex layouts, poor-quality scans, handwritten text, and documents with mixed content types. 
Built on a fine-tuned 7B vision-language model, olmOCR provides enterprise-grade PDF processing at a fraction of the cost of proprietary solutions.","cover_img_url":"https://shared.deepinfra.com/models/allenai/olmOCR-7B-0725-FP8/cover_image.71dc051781ee539bf76a588020be60a5c41af5b89b344f0cc4c85da930c784f2.webp","tags":["openai","multimodal","cc-native"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2.7e-05,"cents_per_output_token":0.00015,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":16384,"replaced_by":"google/gemma-4-31B-it","deprecated":1778120282,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-08-02T21:43:01+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-Embedding-8B","type":"embeddings","reported_type":"embeddings","description":"The Qwen3 Embedding model series is the latest proprietary model of the Qwen family, specifically designed for text embedding and ranking tasks. Building upon the dense foundational models of the Qwen3 series, it provides a comprehensive range of text embeddings and reranking models in various sizes (0.6B, 4B, and 8B).","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-Embedding-8B/cover_image.339f1c6c3b4e78e3176830085fd8e6d6b684c1ea94fe98e615eebc58db02bd00.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":1e-06},"max_tokens":32768,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-06-10T13:03:42+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-VL-4B-Thinking","type":"text-generation","reported_type":"text-generation","description":"Meet Qwen3-VL — the most powerful vision-language model in the Qwen series to date.  
This generation delivers comprehensive upgrades across the board: superior text understanding & generation, deeper visual perception & reasoning, extended context length, enhanced spatial and video dynamics comprehension, and stronger agent interaction capabilities.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-VL-4B-Thinking/cover_image.4a81c09d7f340b26c73d65d215a59246ccc66f144767f40acc63a4bb546be553.webp","tags":["openai","cc-native","multimodal","json","structured-output","reasoning","tools"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1e-05,"cents_per_output_token":0.0001,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":262144,"replaced_by":"Qwen/Qwen3-VL-235B-A22B-Instruct","deprecated":1762816775,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-10-14T21:31:44+00:00","private":0,"is_partner":false},{"model_name":"ClarityAI/creative","type":"text-to-image","reported_type":"text-to-image","description":"ClarityAI/creative is an AI-powered image upscaler that enhances details, adds realism, and creatively modifies images to improve their quality and visual appeal.","cover_img_url":"","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":5.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-01-26T12:49:49+00:00","private":0,"is_partner":true},{"model_name":"openai/whisper-timestamped-medium.en","type":"automatic-speech-recognition","reported_type":"automatic-speech-recognition","description":"Whisper is a set of multi-lingual, robust speech recognition models trained by OpenAI that achieve state-of-the-art results in many languages. 
Whisper models were trained to predict approximate timestamps on speech segments (most of the time with 1-second accuracy), but they cannot originally predict word timestamps. This variant contains implementation to predict word timestamps and provide a more accurate estimation of speech segments when transcribing with Whisper models.","cover_img_url":"","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"time","cents_per_sec":0.05},"max_tokens":null,"replaced_by":"openai/whisper-large-v3","deprecated":1722034218,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-02-23T00:25:21+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-Reranker-8B","type":"reranker","reported_type":"reranker","description":"The Qwen3 Embedding model series is the latest proprietary model of the Qwen family, specifically designed for text embedding and ranking tasks. Building upon the dense foundational models of the Qwen3 series, it provides a comprehensive range of text embeddings and reranking models in various sizes (0.6B, 4B, and 8B)","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-Reranker-8B/cover_image.64532474108cce61f32c9aa7f01fc1a46e27722ca71cf49a1f8e89403c8445df.webp","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":5e-06},"max_tokens":32768,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-07-02T10:16:13+00:00","private":0,"is_partner":false},{"model_name":"shibing624/text2vec-base-chinese","type":"embeddings","reported_type":"embeddings","description":"A sentence similarity model that can be used for various NLP tasks such as text classification, sentiment analysis, named entity recognition, question answering, and more. It utilizes the CoSENT architecture, which consists of a transformer encoder and a pooling module, to encode input texts into vectors that capture their semantic meaning. 
The model was trained on the nli_zh dataset and achieved high performance on various benchmark datasets.","cover_img_url":"","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":5e-07},"max_tokens":512,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-02-28T17:50:21+00:00","private":0,"is_partner":false},{"model_name":"Bria/video_mask_by_key_points","type":"text-to-video","reported_type":"text-to-video","description":"Identify and segment objects across video frames using specific coordinate points. Just point in the right direction and the model will figure out by itself which object should be masked.","cover_img_url":"https://shared.deepinfra.com/models/Bria/video_mask_by_key_points/cover_image.7b9b1dbb1ae0c00b15495d6125ed5384983fee39daa9484b930344bebcb72353.webp","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"output_length","cents_per_output_sec":14.0},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-01-28T18:12:00+00:00","private":0,"is_partner":true},{"model_name":"black-forest-labs/FLUX.1-Kontext-dev","type":"text-to-image","reported_type":"text-to-image","description":"FLUX.1 Kontext [dev] is a 12-billion-parameter image editing model that transforms visuals based on natural language instructions. 
It allows highly consistent, multi-step edits and is released with open weights under a non-commercial license to empower artists and researchers.","cover_img_url":"https://shared.deepinfra.com/models/black-forest-labs/FLUX.1-Kontext-dev/cover_image.c9d1121a1b3a9ceabe86aaa79861871dfe2dedc583764b97ddd442f62d86e7f4.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":1.0,"default_width":1024,"default_height":1024,"default_iterations":25,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-07-28T18:11:20+00:00","private":0,"is_partner":false},{"model_name":"anthropic/claude-opus-4-8","type":"text-generation","reported_type":"text-generation","description":"Claude Opus 4.8 is our most intelligent Opus model and the best generally available model for coding and agents, with deeper reasoning for enterprise workflows.","cover_img_url":"https://shared.deepinfra.com/models/anthropic/claude-opus-4-8/cover_image.7b99143fea9b57c3cd8c35cf3300ce6f5d72e03a79a39f0ae00a3d7bfb264fb0.webp","tags":["multimodal","no-free-anon","structured-output","cc-native","openai","ocr","json","reasoning","tools"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":0.0005,"cents_per_output_token":0.0025,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":1000000,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-05-29T14:33:49+00:00","private":0,"is_partner":true},{"model_name":"stabilityai/stable-diffusion-2-1","type":"text-to-image","reported_type":"text-to-image","description":"Stable Diffusion is a latent text-to-image diffusion model. 
Generate realistic images given text description","cover_img_url":"https://shared.deepinfra.com/models/stabilityai/stable-diffusion-2-1/cover_image.edbcfabd7061fda746b16334591fe18f7064ac943cdc7c9e4a819cf108207834.jpg","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"time","cents_per_sec":0.05},"max_tokens":null,"replaced_by":"stabilityai/sdxl-turbo","deprecated":1727456832,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-03-17T02:06:19+00:00","private":0,"is_partner":false},{"model_name":"google/veo-3.0-fast","type":"text-to-video","reported_type":"text-to-video","description":"Veo 3 Fast is a speed-optimized version of the Veo 3 model, designed for rapid video creation. While maintaining high quality, it delivers results in a fraction of the time, making it ideal for quick iterations and dynamic content generation.","cover_img_url":"https://shared.deepinfra.com/models/google/veo-3.0-fast/cover_image.d0ccd94f2e9b613048b22e9323bb73ac954e8bfb74ca6dacc711b3cf1c36bc46.webp","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"output_length","cents_per_output_sec":15.0},"max_tokens":null,"replaced_by":"google/veo-3.1-fast","deprecated":1779305255,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-08-13T11:35:44+00:00","private":0,"is_partner":true},{"model_name":"meta-llama/Llama-4-Scout-17B-16E-Instruct","type":"text-generation","reported_type":"text-generation","description":"The Llama 4 collection of models are natively multimodal AI models that enable text and multimodal experiences. These models leverage a mixture-of-experts architecture to offer industry-leading performance in text and image understanding. 
Llama 4 Scout, a 17 billion parameter model with 16 experts","cover_img_url":"https://shared.deepinfra.com/models/meta-llama/Llama-4-Scout-17B-16E-Instruct/cover_image.8dcc2374ffbf18a0ca87e5bddb0f6b8dbc59890038f5dedc837a3b6793e47413.webp","tags":["ocr","openai","tools","multimodal","b200","json","non-reasoning","structured-output","cc-native","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1e-05,"cents_per_output_token":3e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":327680,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-04-05T21:27:01+00:00","private":0,"is_partner":false},{"model_name":"nvidia/Llama-3.1-Nemotron-70B-Instruct","type":"text-generation","reported_type":"text-generation","description":"Llama-3.1-Nemotron-70B-Instruct is a large language model customized by NVIDIA to improve the helpfulness of LLM generated responses to user queries. This model reaches Arena Hard of 85.0, AlpacaEval 2 LC of 57.6 and GPT-4-Turbo MT-Bench of 8.98, which are known to be predictive of LMSys Chatbot Arena Elo.  
As of 16th Oct 2024, this model is #1 on all three automatic alignment benchmarks (verified tab for AlpacaEval 2 LC), edging out strong frontier models such as GPT-4o and Claude 3.5 Sonnet.","cover_img_url":"https://shared.deepinfra.com/models/nvidia/Llama-3.1-Nemotron-70B-Instruct/cover_image.31b2930c198e95eb94b86c256b172840b428d943675d5552f3de80eb89170d00.webp","tags":["openai","tools","json","non-reasoning","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":0.00012,"cents_per_output_token":0.00012,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning","deprecated":1778120245,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2024-10-16T17:47:08+00:00","private":0,"is_partner":false},{"model_name":"nvidia/Nemotron-3-Nano-30B-A3B","type":"text-generation","reported_type":"text-generation","description":"NVIDIA Nemotron 3 Nano is an open small reasoning model optimized for fast, cost-efficient inference in agentic and production workloads. Built with a hybrid Mixture-of-Experts (MoE) and Mamba-Transformer architecture, it delivers strong multi-step reasoning, high token throughput, stable latency with predictable cost, and efficient deployment for agent-based systems. 
Designed for real-world AI systems where reasoning can generate significantly more tokens per prompt, Nemotron Nano reduces compute cost while maintaining strong reasoning quality.","cover_img_url":"https://shared.deepinfra.com/models/nvidia/Nemotron-3-Nano-30B-A3B/cover_image.fe979d7e84a6a3ea8700a8c9b5179e4c0943ddc457cd841ae84bd9e06fef4b9b.webp","tags":["tools","reasoning","can-disable-reasoning","openai","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":5e-06,"cents_per_output_token":2e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2025-12-15T03:01:09+00:00","private":0,"is_partner":false},{"model_name":"allenai/olmOCR-2","type":"text-generation","reported_type":"text-generation","description":"olmOCR is a specialized AI tool that converts PDF documents into clean, structured text while preserving important formatting and layout information. What makes olmOCR particularly valuable for developers is its ability to handle challenging PDFs that traditional OCR tools struggle with—including complex layouts, poor-quality scans, handwritten text, and documents with mixed content types. 
Built on a fine-tuned 7B vision-language model, olmOCR provides enterprise-grade PDF processing at a fraction of the cost of proprietary solutions.","cover_img_url":"https://shared.deepinfra.com/models/allenai/olmOCR-2/cover_image.b1d92d03d293c67dc223bd3ec07e1c485586e75d275544b8e6cb00d722160015.webp","tags":["openai","multimodal","cc-native","json","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.4e-05,"cents_per_output_token":8e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":16384,"replaced_by":"google/gemma-4-31B-it","deprecated":1778120282,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-10-21T22:36:50+00:00","private":0,"is_partner":false},{"model_name":"Wan-AI/Wan2.7-R2V","type":"text-to-video","reported_type":"text-to-video","description":"Accurately preserve the look and voice of people or objects from a reference video, supporting multi-reference co-creation.","cover_img_url":"https://shared.deepinfra.com/models/Wan-AI/Wan2.7-R2V/cover_image.e7d297f74365f693cdbd14a425e82b7593e45b6e60cf4178ca7c909c55d6ec77.webp","tags":["no-free-anon"],"pricing":{"short":"$0.10 / second","full":"$0.15 / second for 1080P, $0.10 / second for 720P","table":{"columns":["resolution","$ cost per second"],"rows":[["720P","$0.10"],["1080P","$0.15"]]},"type":"output_length","cents_per_output_sec":10.0},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-04-27T15:25:20+00:00","private":0,"is_partner":true},{"model_name":"openai/whisper-small.en","type":"automatic-speech-recognition","reported_type":"automatic-speech-recognition","description":"Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation, trained on 680k hours of labelled data without the need for fine-tuning. 
It is a Transformer based encoder-decoder model, trained on either English-only or multilingual data, and is available in five configurations of varying model sizes. The models were trained on the tasks of speech recognition and speech translation, predicting transcriptions in the same or different languages as the audio.","cover_img_url":"","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"time","cents_per_sec":0.05},"max_tokens":null,"replaced_by":"openai/whisper-large-v3","deprecated":1722034191,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-02-15T22:56:04+00:00","private":0,"is_partner":false},{"model_name":"cognitivecomputations/dolphin-2.9.1-llama-3-70b","type":"text-generation","reported_type":"text-generation","description":"Dolphin 2.9.1, a fine-tuned Llama-3-70b model. The new model, trained on filtered data, is more compliant but uncensored. It demonstrates improvements in instruction, conversation, coding, and function calling abilities.","cover_img_url":"https://shared.deepinfra.com/models/cognitivecomputations/dolphin-2.9.1-llama-3-70b/cover_image.4192f8825b0d9ba661693dc1dc02fd51866bee235787a4c2eb012353cff140a3.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3.5e-05,"cents_per_output_token":4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":8192,"replaced_by":"Qwen/Qwen2.5-72B-Instruct","deprecated":1728579753,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-07-16T23:46:15+00:00","private":0,"is_partner":false},{"model_name":"bigcode/starcoder2-15b-instruct-v0.1","type":"text-generation","reported_type":"text-generation","description":"We introduce StarCoder2-15B-Instruct-v0.1, the very first entirely self-aligned code Large Language Model (LLM) trained with a fully permissive and transparent pipeline. 
Our open-source pipeline uses StarCoder2-15B to generate thousands of instruction-response pairs, which are then used to fine-tune StarCoder-15B itself without any human annotations or distilled data from huge and proprietary LLMs.","cover_img_url":"https://shared.deepinfra.com/models/bigcode/starcoder2-15b-instruct-v0.1/cover_image.f20836d5d430a93c8aa95eefc92154d14e93dc69b17967410ecce3187a07c601.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.5e-05,"cents_per_output_token":1.5e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":null,"replaced_by":"meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo","deprecated":1781217521,"quantization":"fp16","mmlu":null,"expected":null,"create_ts":"2024-05-08T00:06:29+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-VL-8B-Instruct","type":"text-generation","reported_type":"text-generation","description":"Meet Qwen3-VL — the most powerful vision-language model in the Qwen series to date.  
This generation delivers comprehensive upgrades across the board: superior text understanding & generation, deeper visual perception & reasoning, extended context length, enhanced spatial and video dynamics comprehension, and stronger agent interaction capabilities.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-VL-8B-Instruct/cover_image.ec5ebb48ae31ee78754d76b2d4e4a3effdc157984f9c31208d7362d89f16e136.webp","tags":["openai","cc-native","multimodal","json","structured-output","tools"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.8e-05,"cents_per_output_token":6.9e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":262144,"replaced_by":"Qwen/Qwen3-VL-235B-A22B-Instruct","deprecated":1762816773,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-10-14T21:32:12+00:00","private":0,"is_partner":false},{"model_name":"meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo","type":"text-generation","reported_type":"text-generation","description":"Meta developed and released the Meta Llama 3.1 family of large language models (LLMs), a collection of pretrained and instruction tuned generative text models in 8B, 70B and 405B 
sizes","cover_img_url":"https://shared.deepinfra.com/models/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo/cover_image.cad8905e3734dbb35efcadd521b580b60745e251dfb88235f96d2bc3aa397fdd.webp","tags":["json","openai","tools","non-reasoning"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2e-06,"cents_per_output_token":3e-06,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":45.59,"expected":null,"create_ts":"2024-11-14T17:25:38+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-Reranker-0.6B","type":"reranker","reported_type":"reranker","description":"The Qwen3 Embedding model series is the latest proprietary model of the Qwen family, specifically designed for text embedding and ranking tasks. Building upon the dense foundational models of the Qwen3 series, it provides a comprehensive range of text embeddings and reranking models in various sizes (0.6B, 4B, and 8B)","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-Reranker-0.6B/cover_image.64532474108cce61f32c9aa7f01fc1a46e27722ca71cf49a1f8e89403c8445df.webp","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":1e-06},"max_tokens":32768,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-07-01T21:00:35+00:00","private":0,"is_partner":false},{"model_name":"zai-org/GLM-4.5","type":"text-generation","reported_type":"text-generation","description":"The GLM-4.5 series models are foundation models designed for intelligent agents. GLM-4.5 has 355 billion total parameters with 32 billion active parameters, while GLM-4.5-Air adopts a more compact design with 106 billion total parameters and 12 billion active parameters. 
GLM-4.5 models unify reasoning, coding, and intelligent agent capabilities to meet the complex demands of intelligent agent applications.","cover_img_url":"https://shared.deepinfra.com/models/zai-org/GLM-4.5/cover_image.4c33e3b1f21ba89bd8e5c963c737730c2579575987bded69b986088117089161.webp","tags":["openai","cc-native","tools","reasoning","json"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3.8e-05,"cents_per_output_token":0.00016,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":"zai-org/GLM-4.6","deprecated":1761350192,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-07-28T19:56:23+00:00","private":0,"is_partner":false},{"model_name":"deepseek-ai/DeepSeek-R1-0528","type":"text-generation","reported_type":"text-generation","description":"The DeepSeek R1 model has undergone a minor version upgrade, with the current version being DeepSeek-R1-0528.","cover_img_url":"https://shared.deepinfra.com/models/deepseek-ai/DeepSeek-R1-0528/cover_image.1c2b6d82000fef8e12c4318b3f5b5d71411fc73bcf000ff2674b32e6af8dba75.webp","tags":["openai","tools","reasoning","b200","json","non-reasoning","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":5e-05,"cents_per_output_token":0.000215,"rate_per_input_token_cached":0.7,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":163840,"replaced_by":null,"deprecated":null,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2025-05-28T18:36:15+00:00","private":0,"is_partner":false},{"model_name":"stepfun-ai/Step-3.7-Flash","type":"text-generation","reported_type":"text-generation","description":"Step 3.7 Flash is an open-source multimodal reasoning model by StepFun with 198B total parameters (11B active) using Mixture of Experts. 
It accepts   text and image inputs and features a 256K context window, selectable reasoning effort, tool calling, and agentic capabilities for coding and search workflows, scoring   80.9% on GPQA Diamond and 56.3% on SWE-bench Pro.","cover_img_url":"https://shared.deepinfra.com/models/stepfun-ai/Step-3.7-Flash/cover_image.a209d31b78754c186844fde80ed0ffbca78f773c0a59f99ca4c900a2257759c7.webp","tags":["openai","multimodal","cc-native","tools","input-video"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2e-05,"cents_per_output_token":0.000115,"rate_per_input_token_cached":0.2,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"modelopt","mmlu":null,"expected":null,"create_ts":"2026-06-10T23:04:58+00:00","private":0,"is_partner":false},{"model_name":"Gryphe/MythoMax-L2-13b","type":"text-generation","reported_type":"text-generation","description":"","cover_img_url":"https://shared.deepinfra.com/models/Gryphe/MythoMax-L2-13b/cover_image.7567613d62f797fa930227a88202f7aee5ef30da38e6c9c7b775979cc71220bc.webp","tags":["json","openai","non-reasoning","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":4e-05,"cents_per_output_token":4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":4096,"replaced_by":null,"deprecated":null,"quantization":"fp16","mmlu":null,"expected":null,"create_ts":"2023-10-04T23:04:03+00:00","private":0,"is_partner":false},{"model_name":"meta-llama/Meta-Llama-3.1-405B-Instruct","type":"text-generation","reported_type":"text-generation","description":"Meta developed and released the Meta Llama 3.1 family of large language models (LLMs), a collection of pretrained and instruction tuned generative text models in 8B, 70B and 405B 
sizes","cover_img_url":"https://shared.deepinfra.com/models/meta-llama/Meta-Llama-3.1-405B-Instruct/cover_image.3613f6cc9339458731dfdff96dbf39481e1857ffc29d112eb5ef5426556f93e1.webp","tags":["openai","tools","no-free-anon","json"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":8e-05,"cents_per_output_token":8e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":32768,"replaced_by":"NousResearch/Hermes-3-Llama-3.1-405B","deprecated":1752207171,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2024-07-24T03:48:20+00:00","private":0,"is_partner":false},{"model_name":"deepseek-ai/DeepSeek-V3-0324-Turbo","type":"text-generation","reported_type":"text-generation","description":"","cover_img_url":"","tags":["openai","tools","b200"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":0.0001,"cents_per_output_token":0.0003,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":32768,"replaced_by":"deepseek-ai/DeepSeek-V3-0324","deprecated":1756857096,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2025-06-03T06:02:01+00:00","private":0,"is_partner":false},{"model_name":"openai/clip-vit-large-patch14-336","type":"zero-shot-image-classification","reported_type":"zero-shot-image-classification","description":"A zero-shot-image-classification model released by OpenAI.\nThe clip-vit-large-patch14-336 model was trained from scratch on an unknown dataset and achieves unspecified results on the evaluation set. The model's intended uses and limitations, as well as its training and evaluation data, are not provided. 
The training procedure used an unknown optimizer and precision, and the framework versions included Transformers 4.21.3, TensorFlow 2.8.2, and Tokenizers 0.12.1.","cover_img_url":"","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"time","cents_per_sec":0.05},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-05-01T23:58:05+00:00","private":0,"is_partner":false},{"model_name":"microsoft/phi-4-reasoning-plus","type":"text-generation","reported_type":"text-generation","description":"Phi-4-reasoning-plus is a state-of-the-art open-weight reasoning model finetuned from Phi-4 using supervised fine-tuning on a dataset of chain-of-thought traces and reinforcement learning. The supervised fine-tuning dataset includes a blend of synthetic prompts and high-quality filtered data from public domain websites, focused on math, science, and coding skills as well as alignment data for safety and Responsible AI. The goal of this approach was to ensure that small capable models were trained with data focused on high quality and advanced reasoning. 
Phi-4-reasoning-plus has been trained additionally with Reinforcement Learning, hence, it has higher accuracy but generates on average 50% more tokens, thus having higher latency.","cover_img_url":"","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":7e-06,"cents_per_output_token":3.5e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":32768,"replaced_by":"microsoft/phi-4","deprecated":1754935816,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2025-05-01T21:33:11+00:00","private":0,"is_partner":false},{"model_name":"openai/whisper-timestamped-medium","type":"automatic-speech-recognition","reported_type":"automatic-speech-recognition","description":"Whisper is a set of multi-lingual, robust speech recognition models trained by OpenAI that achieve state-of-the-art results in many languages. Whisper models were trained to predict approximate timestamps on speech segments (most of the time with 1-second accuracy), but they cannot originally predict word timestamps. This version has implementation to predict word timestamps and provide a more accurate estimation of speech segments when transcribing with Whisper models.","cover_img_url":"","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"time","cents_per_sec":0.05},"max_tokens":null,"replaced_by":"openai/whisper-large-v3","deprecated":1722034220,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-02-22T00:27:10+00:00","private":0,"is_partner":false},{"model_name":"Gryphe/MythoMax-L2-13b-turbo","type":"text-generation","reported_type":"text-generation","description":"Faster version of Gryphe/MythoMax-L2-13b running on multiple H100 cards in fp8 precision. Up to 160 tps. 
","cover_img_url":"https://shared.deepinfra.com/models/Gryphe/MythoMax-L2-13b-turbo/cover_image.1975a8dba4bcf6809a083fe29aff31cb2895a749171f89e0db650bc039da051a.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.3e-05,"cents_per_output_token":1.3e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":4096,"replaced_by":"Gryphe/MythoMax-L2-13b","deprecated":1718830497,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2024-05-07T23:29:58+00:00","private":0,"is_partner":false},{"model_name":"ByteDance/Seed-2.0-mini","type":"text-generation","reported_type":"text-generation","description":"Built for low-latency, high-concurrency, cost-sensitive use cases, with flexible deployment, four-tier thinking, and multimodal","cover_img_url":"https://shared.deepinfra.com/models/ByteDance/Seed-2.0-mini/cover_image.901f096e1d9028c059113b608c96004090fc4ec7d4bcc50bc257614ad3a0e214.webp","tags":["openai","tools","reasoning","multimodal","no-free-anon","json","structured-output","cc-native"],"pricing":{"short":null,"full":"$0.10 in $0.40 out $0.02 cached <= 128K, $0.2 in $0.80 out $0.2 cached","table":null,"type":"tokens","cents_per_input_token":1e-05,"cents_per_output_token":4e-05,"rate_per_input_token_cached":0.2,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":256000,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-02-17T12:44:41+00:00","private":0,"is_partner":true},{"model_name":"Qwen/Qwen3-VL-4B-Instruct","type":"text-generation","reported_type":"text-generation","description":"Meet Qwen3-VL — the most powerful vision-language model in the Qwen series to date.  
This generation delivers comprehensive upgrades across the board: superior text understanding & generation, deeper visual perception & reasoning, extended context length, enhanced spatial and video dynamics comprehension, and stronger agent interaction capabilities.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-VL-4B-Instruct/cover_image.df7d2149dd050d8b27718b228137e5782e790c5ffffc792a9bc3e88d283a70af.webp","tags":["openai","cc-native","multimodal","json","structured-output","tools"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1e-05,"cents_per_output_token":6e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":262144,"replaced_by":"Qwen/Qwen3-VL-235B-A22B-Instruct","deprecated":1762816774,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-10-14T21:31:56+00:00","private":0,"is_partner":false},{"model_name":"Pixverse/Pixverse-6-I2V","type":"text-to-video","reported_type":"text-to-video","description":"PixVerse V6 redefines AI video by shifting from isolated generation to a unified, model-driven workflow. Key upgrades include 15-second durations at 1080p resolution and a multi-shot engine. This transition allows creators to move beyond short clips toward meaningful narrative production and professional-grade marketing assets suitable for 2026 digital distribution standards.","cover_img_url":"https://shared.deepinfra.com/models/Pixverse/Pixverse-6-I2V/cover_image.6399dc983b6c60d5505008a0c413f6c6cce7fd9b08ca2d3b0f595a10c72a2850.webp","tags":["no-free-anon"],"pricing":{"short":"$0.045 / second","full":"Per-second pricing varies by quality and audio toggle. 
360p: $0.025/$0.035 · 540p: $0.035/$0.045 · 720p: $0.045/$0.060 · 1080p: $0.090/$0.115 (no audio /\n  with audio).","table":null,"type":"output_length","cents_per_output_sec":4.5},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-05-29T13:58:05+00:00","private":0,"is_partner":true},{"model_name":"Qwen/Qwen3-Next-80B-A3B-Instruct","type":"text-generation","reported_type":"text-generation","description":"Over the past few months, we have observed increasingly clear trends toward scaling both total parameters and context lengths in the pursuit of more powerful and agentic artificial intelligence (AI). We are excited to share our latest advancements in addressing these demands, centered on improving scaling efficiency through innovative model architecture. We call this next-generation foundation models Qwen3-Next.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-Next-80B-A3B-Instruct/cover_image.8e9e1997110314a24b6645cf5ecbc3e4bd4dd456d2d109dd42326c0918eaa934.webp","tags":["openai","tools","json","non-reasoning","structured-output","cc-native","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":9e-06,"cents_per_output_token":0.00011,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-09-11T21:28:04+00:00","private":0,"is_partner":false},{"model_name":"FastVideo/LTX-2.3-Distilled-Diffusers","type":"text-to-video","reported_type":"text-to-video","description":"A fast, step-distilled build of Lightricks' LTX-2.3 diffusion-transformer video model (distilled by FastVideo).  
Generates high-fidelity text-to-video and image-to-video in just a few denoising steps.","cover_img_url":"","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"output_length","cents_per_output_sec":3.5},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-06-20T02:25:26+00:00","private":0,"is_partner":false},{"model_name":"zai-org/GLM-5.1","type":"text-generation","reported_type":"text-generation","description":"GLM-5.1 is Z-AI's next-generation flagship model for agentic engineering, with significantly stronger coding capabilities than its predecessor. It achieves state-of-the-art performance on SWE-Bench Pro and leads GLM-5 by a wide margin on NL2Repo (repo generation) and Terminal-Bench 2.0 (real-world terminal tasks).","cover_img_url":"https://shared.deepinfra.com/models/zai-org/GLM-5.1/cover_image.232e79427fb8bc75c65cfdee32570430d6bfa2f692a8e084077d2f06d7f2e0da.webp","tags":["reasoning","openai","tools","can-disable-reasoning","structured-output","json","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":0.000105,"cents_per_output_token":0.00035,"rate_per_input_token_cached":0.1952381,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":202752,"replaced_by":null,"deprecated":null,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2026-04-07T05:09:59+00:00","private":0,"is_partner":false},{"model_name":"Zyphra/Zonos-v0.1-hybrid","type":"text-to-speech","reported_type":"text-to-speech","description":"Zonos-v0.1 is a leading open-weight text-to-speech model trained on more than 200k hours of varied multilingual speech, delivering expressiveness and quality on par with—or even surpassing—top TTS providers.  
Our model enables highly natural speech generation from text prompts when given a speaker embedding or audio prefix, and can accurately perform speech cloning when given a reference clip spanning just a few seconds. The conditioning setup also allows for fine control over speaking rate, pitch variation, audio quality, and emotions such as happiness, fear, sadness, and anger. The model outputs speech natively at 44kHz.","cover_img_url":"https://shared.deepinfra.com/models/Zyphra/Zonos-v0.1-hybrid/cover_image.d68da023e1c56e479cfb000ee0a75324a9876fd8ed12a51ee0efdfa1640ed53d.webp","tags":["voice"],"pricing":{"short":null,"full":null,"table":null,"type":"input_character_length","cents_per_input_chars":0.0007},"max_tokens":null,"replaced_by":null,"deprecated":1778878702,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-02-14T19:48:46+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-VL-235B-A22B-Thinking","type":"text-generation","reported_type":"text-generation","description":"Meet Qwen3-VL — the most powerful vision-language model in the Qwen series to date.  
This generation delivers comprehensive upgrades across the board: superior text understanding & generation, deeper visual perception & reasoning, extended context length, enhanced spatial and video dynamics comprehension, and stronger agent interaction capabilities.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-VL-235B-A22B-Thinking/cover_image.0cfb76217c176bd49a4d2f6a4163d74fc3789f7bf4ce17a33bab2722be230ff5.webp","tags":["openai","cc-native","multimodal","json","structured-output","reasoning","tools"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":4.5e-05,"cents_per_output_token":0.000349,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":262144,"replaced_by":"Qwen/Qwen3-VL-235B-A22B-Instruct","deprecated":1762366993,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-10-14T21:30:59+00:00","private":0,"is_partner":false},{"model_name":"nvidia/Llama-3.3-Nemotron-Super-49B-v1.5","type":"text-generation","reported_type":"text-generation","description":"Llama-3.3-Nemotron-Super-49B-v1.5 is a large language model (LLM) optimized for advanced reasoning, conversational interactions, retrieval-augmented generation (RAG), and tool-calling tasks. Derived from Meta's Llama-3.3-70B-Instruct, it employs a Neural Architecture Search (NAS) approach, significantly enhancing efficiency and reducing memory requirements. 
","cover_img_url":"https://shared.deepinfra.com/models/nvidia/Llama-3.3-Nemotron-Super-49B-v1.5/cover_image.1ac20858cd086a9c458798b2cca22b1ef8cc50b8133dc302ad9969eaffe2757a.webp","tags":["openai","tools","reasoning","json","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":4e-05,"cents_per_output_token":4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-09-09T21:56:24+00:00","private":0,"is_partner":false},{"model_name":"PaddlePaddle/PaddleOCR-VL-0.9B","type":"text-generation","reported_type":"text-generation","description":"PaddleOCR-VL is a SOTA and resource-efficient model tailored for document parsing. Its core component is PaddleOCR-VL-0.9B, a compact yet powerful vision-language model (VLM) that integrates a NaViT-style dynamic resolution visual encoder with the ERNIE-4.5-0.3B language model to enable accurate element recognition. This innovative model efficiently supports 109 languages and excels in recognizing complex elements (e.g., text, tables, formulas, and charts), while maintaining minimal resource consumption. Through comprehensive evaluations on widely used public benchmarks and in-house benchmarks, PaddleOCR-VL achieves SOTA performance in both page-level document parsing and element-level recognition. It significantly outperforms existing solutions, exhibits strong competitiveness against top-tier VLMs, and delivers fast inference speeds. 
These strengths make it highly suitable for practical deployment in real-world scenarios.","cover_img_url":"https://shared.deepinfra.com/models/PaddlePaddle/PaddleOCR-VL-0.9B/cover_image.2e629a328c9d200645698b53649877cb61e825f949b0223c6c2144455042d80b.webp","tags":["ocr","openai","multimodal","ocr-ui","json","non-reasoning","structured-output","cc-native"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.4e-05,"cents_per_output_token":8e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":16384,"replaced_by":"google/gemma-4-31B-it","deprecated":1778120344,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2025-10-17T21:19:37+00:00","private":0,"is_partner":false},{"model_name":"CompVis/stable-diffusion-v1-4","type":"text-to-image","reported_type":"text-to-image","description":"Stable Diffusion is a latent text-to-image diffusion model capable of generating photo-realistic images given any text input.","cover_img_url":"","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"time","cents_per_sec":0.05},"max_tokens":null,"replaced_by":"stabilityai/sdxl-turbo","deprecated":1727456630,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-03-17T01:54:00+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-Embedding-0.6B-batch","type":"embeddings","reported_type":"embeddings","description":"The Qwen3 Embedding model series is the latest proprietary model of the Qwen family, specifically designed for text embedding and ranking tasks. 
Building upon the dense foundational models of the Qwen3 series, it provides a comprehensive range of text embeddings and reranking models in various sizes (0.6B, 4B, and 8B).","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-Embedding-0.6B-batch/cover_image.0d97a5a6c8c888a5d0de7fddbe265531552c2c852759bdde17ecede2618d4a25.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":5e-07},"max_tokens":32768,"replaced_by":"Qwen/Qwen3-Embedding-0.6B","deprecated":1783352679,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-10-30T12:25:58+00:00","private":0,"is_partner":false},{"model_name":"sesame/csm-1b","type":"text-to-speech","reported_type":"text-to-speech","description":"CSM (Conversational Speech Model) is a speech generation model from Sesame that generates RVQ audio codes from text and audio inputs. The model architecture employs a Llama backbone and a smaller audio decoder that produces Mimi audio codes.","cover_img_url":"https://shared.deepinfra.com/models/sesame/csm-1b/cover_image.e79911aabdd060a3939b5188731bb689a92a627c7e9a0d6e35574937707997ab.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_character_length","cents_per_input_chars":0.0007},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-03-15T02:12:49+00:00","private":0,"is_partner":false},{"model_name":"anthropic/claude-4-opus","type":"text-generation","reported_type":"text-generation","description":"Anthropic’s most powerful model yet and the state-of-the-art coding model. It delivers sustained performance on long-running tasks that require focused effort and thousands of steps, significantly expanding what AI agents can solve. 
Claude Opus 4 is ideal for powering frontier agent products and features.","cover_img_url":"https://shared.deepinfra.com/models/anthropic/claude-4-opus/cover_image.751b974708687a8076f1ed6c27922e9cf8d4334135b811e60387e8957ffcd248.webp","tags":["ocr","openai","tools","multimodal","no-free-anon","json","non-reasoning","structured-output","cc-native"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":0.00165,"cents_per_output_token":0.00825,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":200000,"replaced_by":"anthropic/claude-opus-4-7","deprecated":1779304896,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-06-12T17:44:49+00:00","private":0,"is_partner":true},{"model_name":"Qwen/Qwen3-32B","type":"text-generation","reported_type":"text-generation","description":"Qwen3 is the latest generation of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. 
Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-32B/cover_image.011f77376c7d6752c9f56907c8fd353c3484a570d3d80f0d61e6fdc03379525b.webp","tags":["openai","tools","reasoning","json","non-reasoning","structured-output","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":8e-06,"cents_per_output_token":2.8e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":40960,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-04-28T23:14:24+00:00","private":0,"is_partner":false},{"model_name":"Sao10K/L3.1-70B-Euryale-v2.2","type":"text-generation","reported_type":"text-generation","description":"Euryale 3.1 - 70B v2.2 is a model focused on creative roleplay from Sao10k","cover_img_url":"https://shared.deepinfra.com/models/Sao10K/L3.1-70B-Euryale-v2.2/cover_image.f22f2e0fffe03183d6214faed551f62cdaaa3a3c6eefd1bf46b7d3c00b0b4ec4.webp","tags":["json","openai","non-reasoning","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":8.5e-05,"cents_per_output_token":8.5e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2024-09-12T18:00:32+00:00","private":0,"is_partner":false},{"model_name":"google/gemma-1.1-7b-it","type":"text-generation","reported_type":"text-generation","description":"Gemma is an open-source model designed by Google. This is Gemma 1.1 7B (IT), an update over the original instruction-tuned Gemma release. 
Gemma 1.1 was trained using a novel RLHF method, leading to substantial gains on quality, coding capabilities, factuality, instruction following and multi-turn conversation quality.","cover_img_url":"https://shared.deepinfra.com/models/google/gemma-1.1-7b-it/cover_image.18ce701e05fe6377f27c31c2e0d05649b7962f184f8dc894e31aad3ffb468f70.webp","tags":["openai","json"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":7e-06,"cents_per_output_token":7e-06,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":8192,"replaced_by":"google/gemma-3-12b-it","deprecated":1743551605,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-04-09T23:56:27+00:00","private":0,"is_partner":false},{"model_name":"black-forest-labs/FLUX-1-dev","type":"text-to-image","reported_type":"text-to-image","description":"FLUX.1-dev is a state-of-the-art 12 billion parameter rectified flow transformer developed by Black Forest Labs. This model excels in text-to-image generation, providing highly accurate and detailed outputs. 
It is particularly well-regarded for its ability to follow complex prompts and generate anatomically accurate images, especially with challenging details like hands and faces.","cover_img_url":"https://shared.deepinfra.com/models/black-forest-labs/FLUX-1-dev/cover_image.0b41e8b873e624b57bb240b9a3be7025b1bc72cbd786ca65cdd148ac2bd71636.webp","tags":["no-free-anon","openai","lora-base"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":0.9,"default_width":1024,"default_height":1024,"default_iterations":25,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2024-08-21T12:09:53+00:00","private":0,"is_partner":false},{"model_name":"FastVideo/LTX2-Distilled-Diffusers","type":"text-to-video","reported_type":"text-to-video","description":"LTX-2 is a DiT-based audio-video foundation model designed to generate synchronized video and audio within a single model. It brings together the core building blocks of modern video generation, with open weights and a focus on practical, local execution.","cover_img_url":"","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"output_length","cents_per_output_sec":3.6},"max_tokens":null,"replaced_by":"FastVideo/LTX-2.3-Distilled-Diffusers","deprecated":1782269990,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-04-29T18:14:11+00:00","private":0,"is_partner":false},{"model_name":"Qwen/QwQ-32B-Preview","type":"text-generation","reported_type":"text-generation","description":"QwQ is an experimental research model developed by the Qwen Team, designed to advance AI reasoning capabilities. This model embodies the spirit of philosophical inquiry, approaching problems with genuine wonder and doubt. QwQ demonstrates impressive analytical abilities, achieving scores of 65.2% on GPQA, 50.0% on AIME, 90.6% on MATH-500, and 50.0% on LiveCodeBench. 
With its contemplative approach and exceptional performance on complex problems.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/QwQ-32B-Preview/cover_image.9e4979e6c07cf7f6dc3866756f67a33ccebd2b07f3bbe612fb413e523e69942f.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.2e-05,"cents_per_output_token":1.8e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":32768,"replaced_by":"deepseek-ai/DeepSeek-R1-0528","deprecated":1781220528,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-11-27T23:21:36+00:00","private":0,"is_partner":false},{"model_name":"anthropic/claude-3-7-sonnet-latest","type":"text-generation","reported_type":"text-generation","description":"","cover_img_url":"","tags":["ocr","openai","tools","multimodal","no-free-anon","json","non-reasoning","structured-output","cc-native"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":0.00033,"cents_per_output_token":0.00165,"rate_per_input_token_cached":0.1,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":200000,"replaced_by":"anthropic/claude-sonnet-4-6","deprecated":1779304967,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-03-13T00:10:15+00:00","private":0,"is_partner":true},{"model_name":"Qwen/Qwen2.5-Coder-32B-Instruct","type":"text-generation","reported_type":"text-generation","description":"Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen). It has significant improvements in code generation, code reasoning and code fixing. A more comprehensive foundation for real-world applications such as Code Agents. 
Not only enhancing coding capabilities but also maintaining its strengths in mathematics and general competencies.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen2.5-Coder-32B-Instruct/cover_image.3f673479a7b03c636f41bb3b3c8ec06713314d247985921cc2d698f589c17758.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":6e-06,"cents_per_output_token":1.5e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":32768,"replaced_by":"Qwen/Qwen3-32B","deprecated":1757622099,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2024-11-11T23:08:15+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-VL-30B-A3B-Thinking","type":"text-generation","reported_type":"text-generation","description":"Meet Qwen3-VL — the most powerful vision-language model in the Qwen series to date.  This generation delivers comprehensive upgrades across the board: superior text understanding & generation, deeper visual perception & reasoning, extended context length, enhanced spatial and video dynamics comprehension, and stronger agent interaction 
capabilities.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-VL-30B-A3B-Thinking/cover_image.64dd36b17da69cd6d73e42787c42512cb4e18c79167beaf46a756ea45b95b588.webp","tags":["openai","cc-native","multimodal","json","structured-output","reasoning","tools"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2.9e-05,"cents_per_output_token":9.9e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":262144,"replaced_by":"Qwen/Qwen3-VL-235B-A22B-Instruct","deprecated":1762816774,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-10-14T21:31:23+00:00","private":0,"is_partner":false},{"model_name":"Bria/video_remove_background","type":"text-to-video","reported_type":"text-to-video","description":"Light and fast. Remove the background of your videos to bring the foreground elements to focus. No more unwanted distractions.","cover_img_url":"https://shared.deepinfra.com/models/Bria/video_remove_background/cover_image.25eccb6ba831bc3a0ad69c9289f4bfe845ca76c1c56c3c95ae4b1cdcf541e790.webp","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"output_length","cents_per_output_sec":0.42},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-01-28T18:10:48+00:00","private":0,"is_partner":true},{"model_name":"mistralai/Mistral-Small-3.2-24B-Instruct-2506","type":"text-generation","reported_type":"text-generation","description":"Mistral-Small-3.2-24B-Instruct is a drop-in upgrade over the 3.1 release, with markedly better instruction following, roughly half the infinite-generation errors, and a more robust function-calling interface—while otherwise matching or slightly improving on all previous text and vision 
benchmarks.","cover_img_url":"https://shared.deepinfra.com/models/mistralai/Mistral-Small-3.2-24B-Instruct-2506/cover_image.2baa436d9d36ef7306a532f4f6625652dfd092c04b2c23057fb8cdb69e64915d.webp","tags":["ocr","openai","tools","multimodal","json","non-reasoning","structured-output","cc-native","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":7.5e-06,"cents_per_output_token":2e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":128000,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-06-23T23:11:49+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen2.5-VL-32B-Instruct","type":"text-generation","reported_type":"text-generation","description":"","cover_img_url":"","tags":["ocr","openai","multimodal","json","non-reasoning","structured-output","cc-native"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2e-05,"cents_per_output_token":6e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":128000,"replaced_by":"Qwen/Qwen3-VL-30B-A3B-Instruct","deprecated":1777080884,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2025-07-15T22:59:49+00:00","private":0,"is_partner":false},{"model_name":"MiniMaxAI/MiniMax-M2.7-Turbo","type":"text-generation","reported_type":"text-generation","description":"Speed-optimized 
MiniMax-M2.7","cover_img_url":"https://shared.deepinfra.com/models/MiniMaxAI/MiniMax-M2.7-Turbo/cover_image.707136a4a7ec43bb0a8199f51ab6cfef1f9b23ecbf799fd2b621e3a65352bf08.webp","tags":["openai","cc-native","no-free-anon","json","tools","reasoning"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":4e-05,"cents_per_output_token":0.0002,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":196608,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2026-06-15T21:06:31+00:00","private":0,"is_partner":true},{"model_name":"ByteDance/Seedream-4","type":"text-to-image","reported_type":"text-to-image","description":"Seedream 4.0 is a SOTA multimodal image creation model built on leading architecture. It breaks through the boundaries of traditional text-to-image models by natively supporting text, single-image, and multi-image inputs. Users can freely combine text and images to achieve diverse creative modes within a single model—such as multi-image blending, image editing, and sequentially batch image generation, featuring subject consistency, making image creation more free and 
controllable.","cover_img_url":"https://shared.deepinfra.com/models/ByteDance/Seedream-4/cover_image.480f154335adb8c3ec195ffa092027f8701dc877e90a8b9bc9ed2a718232522d.webp","tags":["no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":4.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-09-24T14:55:58+00:00","private":0,"is_partner":true},{"model_name":"google/gemini-2.0-flash-001","type":"text-generation","reported_type":"text-generation","description":"","cover_img_url":"","tags":["openai","no-free-anon","multimodal","tools","cc-native","json","structured-output","ocr"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1e-05,"cents_per_output_token":4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":1000000,"replaced_by":"google/gemini-2.5-flash","deprecated":1770336000,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-03-04T23:46:55+00:00","private":0,"is_partner":true},{"model_name":"meta-llama/Llama-2-70b-chat-hf","type":"text-generation","reported_type":"text-generation","description":"LLaMa 2 is a collections of LLMs trained by Meta. This is the 70B chat optimized version. 
This endpoint has per token pricing.","cover_img_url":"https://shared.deepinfra.com/models/meta-llama/Llama-2-70b-chat-hf/cover_image.7b3407408b20bd422edfb75da90ee92d0a05649e94b59bf409c827e845fc3c46.webp","tags":["openai","json"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":6.4e-05,"cents_per_output_token":8e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":4096,"replaced_by":"meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo","deprecated":1757621725,"quantization":"fp16","mmlu":null,"expected":null,"create_ts":"2023-08-09T18:01:13+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-Max","type":"text-generation","reported_type":"text-generation","description":"The latest flagship model in the Qwen family. State-of-the-art results across a comprehensive suite of benchmarks — including knowledge, reasoning, coding, instruction following, human preference alignment, agent tasks, and multilingual understanding.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-Max/cover_image.3c5c67a21ba4b7489e6d6531bd6b115793411327124fb93ac886a66eb598ba7e.webp","tags":["openai","tools","no-free-anon","json","non-reasoning","structured-output","cc-native","featured"],"pricing":{"short":"$1.20 in $6.00 out $0.24 cached / 1M tokens","full":"$1.2 in $6 out $0.24 cached <= 32K, $2.4 in $12 out $0.48 cached <= 128K, $3 in $15 out $0.6 cached > 128K","table":null,"type":"tokens","cents_per_input_token":0.00012,"cents_per_output_token":0.0006,"rate_per_input_token_cached":0.2,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":256000,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-02-11T15:43:27+00:00","private":0,"is_partner":true},{"model_name":"zai-org/GLM-5.2","type":"text-generation","reported_type":"text-generation","description":"GLM-5.2 is Z-AI's 
latest flagship model for long-horizon tasks. It marks a substantial leap in long-horizon task capability over its predecessor GLM-5.1 and, for the first time, delivers that capability on a **solid 1M-token context**.","cover_img_url":"https://shared.deepinfra.com/models/zai-org/GLM-5.2/cover_image.0ef2f8af7ca19a7dcb7e8fe4b2d7d7443314654d3d3a496917eea59b07e4f8a1.webp","tags":["openai","tools","json","structured-output","reasoning","can-disable-reasoning","priority","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":9.3e-05,"cents_per_output_token":0.0003,"rate_per_input_token_cached":0.19354839,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":1048576,"replaced_by":null,"deprecated":null,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2026-06-16T16:09:11+00:00","private":0,"is_partner":false},{"model_name":"meta-llama/Llama-3.3-70B-Instruct-Turbo","type":"text-generation","reported_type":"text-generation","description":"Llama 3.3-70B Turbo is a highly optimized version of the Llama 3.3-70B model, utilizing FP8 quantization to deliver significantly faster inference speeds with a minor trade-off in accuracy. The model is designed to be helpful, safe, and flexible, with a focus on responsible deployment and mitigating potential risks such as bias, toxicity, and misinformation. 
It achieves state-of-the-art performance on various benchmarks, including conversational tasks, language translation, and text generation.","cover_img_url":"https://shared.deepinfra.com/models/meta-llama/Llama-3.3-70B-Instruct-Turbo/cover_image.4b398fbacb19ee745df560ce0b12f6f1f2cc6e05ae3c28aa82a9af332086b8df.webp","tags":["openai","tools","b200","json","non-reasoning","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1e-05,"cents_per_output_token":3.2e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2024-12-06T18:20:15+00:00","private":0,"is_partner":false},{"model_name":"Pixverse/Pixverse-T2V","type":"text-to-video","reported_type":"text-to-video","description":"PixVerse's 720p resolution offers a fast and reliable option for generating standard HD videos, ideal for quick previews and social media content where generation speed is prioritized over maximum detail.","cover_img_url":"https://shared.deepinfra.com/models/Pixverse/Pixverse-T2V/cover_image.eb3175d803d5c4ceafe31fa5f63fee5c7d57b8403c368c798ba164c2dd7948d1.webp","tags":["no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":20.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-10-23T15:43:25+00:00","private":0,"is_partner":true},{"model_name":"stabilityai/sdxl-turbo","type":"text-to-image","reported_type":"text-to-image","description":"The SDXL Turbo model, developed by Stability AI, is an optimized, fast text-to-image generative model. 
It is a distilled version of SDXL 1.0, leveraging Adversarial Diffusion Distillation (ADD) to generate high-quality images in fewer steps.","cover_img_url":"https://shared.deepinfra.com/models/stabilityai/sdxl-turbo/cover_image.ab5db98dcb6b4ed16f0b93c4187ef3e8dc6675e7ac0cff482f1504ed89f1fabc.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":0.02,"default_width":1024,"default_height":1024,"default_iterations":5,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2024-09-06T09:56:45+00:00","private":0,"is_partner":false},{"model_name":"mistralai/Mistral-7B-Instruct-v0.1","type":"text-generation","reported_type":"text-generation","description":"The Mistral-7B-Instruct-v0.1 Large Language Model (LLM) is an instruct fine-tuned version of the Mistral-7B-v0.1 generative text model using a variety of publicly available conversation datasets.","cover_img_url":"https://shared.deepinfra.com/models/mistralai/Mistral-7B-Instruct-v0.1/cover_image.cdb70679749ff93eb56b8480215bb8cd1382cbeffcec00a06bdb0145be9ef511.webp","tags":["openai","tools"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":5.5e-06,"cents_per_output_token":5.5e-06,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":32768,"replaced_by":"mistralai/Mistral-Small-3.2-24B-Instruct-2506","deprecated":1757952363,"quantization":"fp16","mmlu":null,"expected":null,"create_ts":"2023-09-28T01:30:09+00:00","private":0,"is_partner":false},{"model_name":"openai/whisper-base.en","type":"automatic-speech-recognition","reported_type":"automatic-speech-recognition","description":"Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation. 
It was trained on 680k hours of labelled data and demonstrated a strong ability to generalise to many datasets and domains without fine-tuning. Whisper checkpoints are available in five configurations of varying model sizes, including a smallest configuration trained on English-only data and a largest configuration trained on multilingual data. This one is English-only.","cover_img_url":"","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"time","cents_per_sec":0.05},"max_tokens":null,"replaced_by":"openai/whisper-large-v3","deprecated":1722034223,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-02-15T22:55:30+00:00","private":0,"is_partner":false},{"model_name":"MiniMaxAI/MiniMax-M2.1","type":"text-generation","reported_type":"text-generation","description":"MiniMax-M2.1 is a model optimized specifically for robustness in coding, tool use, instruction following, and long-horizon planning. From automating multilingual software development to executing complex, multi-step office workflows, MiniMax-M2.1 empowers developers to build the next generation of autonomous applications—all while being fully transparent, controllable, and accessible.","cover_img_url":"https://shared.deepinfra.com/models/MiniMaxAI/MiniMax-M2.1/cover_image.69e4d57cbb8d11fcc2ccfb2db1fcdfafff652c36ad58b29a1b2df75d0bc48206.webp","tags":["openai","tools","json","non-reasoning","cc-native"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2.7e-05,"cents_per_output_token":9.5e-05,"rate_per_input_token_cached":0.10740741,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":196608,"replaced_by":"MiniMaxAI/MiniMax-M2.5","deprecated":1776285489,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2026-01-05T19:58:25+00:00","private":0,"is_partner":false},{"model_name":"thenlper/gte-large","type":"embeddings","reported_type":"embeddings","description":"The GTE models are trained by 
Alibaba DAMO Academy. They are mainly based on the BERT framework and currently offer three different sizes of models, including GTE-large, GTE-base, and GTE-small. The GTE models are trained on a large-scale corpus of relevance text pairs, covering a wide range of domains and scenarios. This enables the GTE models to be applied to various downstream tasks of text embeddings, including information retrieval, semantic textual similarity, text reranking, etc.","cover_img_url":"","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":1e-06},"max_tokens":512,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-11-22T19:57:40+00:00","private":0,"is_partner":false},{"model_name":"zai-org/GLM-4.5-Air","type":"text-generation","reported_type":"text-generation","description":"The GLM-4.5 series models are foundation models designed for intelligent agents. GLM-4.5 has 355 billion total parameters with 32 billion active parameters, while GLM-4.5-Air adopts a more compact design with 106 billion total parameters and 12 billion active parameters. 
GLM-4.5 models unify reasoning, coding, and intelligent agent capabilities to meet the complex demands of intelligent agent applications.","cover_img_url":"https://shared.deepinfra.com/models/zai-org/GLM-4.5-Air/cover_image.b81d10678c90f1e1151c0945ae63e53ea8de1162b787b0c69907502841ca2500.webp","tags":["openai","cc-native","tools","reasoning"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2e-05,"cents_per_output_token":0.00011,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":"openai/gpt-oss-20b","deprecated":1759872821,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-07-28T20:18:56+00:00","private":0,"is_partner":false},{"model_name":"microsoft/WizardLM-2-7B","type":"text-generation","reported_type":"text-generation","description":"WizardLM-2 7B is the smaller variant of Microsoft AI's latest Wizard model. It is the fastest and achieves comparable performance with existing 10x larger open-source leading models","cover_img_url":"https://shared.deepinfra.com/models/microsoft/WizardLM-2-7B/cover_image.305876b2901f4833a14d17e09a3876b59ca7561b73017518970a75643fe9ff69.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":5.5e-06,"cents_per_output_token":5.5e-06,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":32768,"replaced_by":"microsoft/phi-4","deprecated":1736981768,"quantization":"fp16","mmlu":null,"expected":null,"create_ts":"2024-04-16T07:15:38+00:00","private":0,"is_partner":false},{"model_name":"zai-org/GLM-4.7-Flash","type":"text-generation","reported_type":"text-generation","description":"GLM-4.7-Flash is a 30B-A3B MoE model. 
As the strongest model in the 30B class, GLM-4.7-Flash offers a new option for lightweight deployment that balances performance and efficiency.","cover_img_url":"https://shared.deepinfra.com/models/zai-org/GLM-4.7-Flash/cover_image.3c0c7f830a57de23a3f15107b442a12b8a434b922e45a734dd90791583afd960.webp","tags":["openai","tools","reasoning","json","structured-output","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":6e-06,"cents_per_output_token":4e-05,"rate_per_input_token_cached":0.16666667,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":202752,"replaced_by":null,"deprecated":null,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2026-01-28T00:18:07+00:00","private":0,"is_partner":false},{"model_name":"nvidia/Cosmos3-Nano","type":"text-to-video","reported_type":"world-model","description":"Cosmos3 is a world foundation model that unifies understanding and generation within a single Mixture-of-Transformer (MoT) architecture. Two tightly coupled towers—a Reasoner (vision-language model) and a Generator (world simulator)—share latent representations so that structured perception directly grounds realistic, temporally consistent simulation.","cover_img_url":"https://shared.deepinfra.com/models/nvidia/Cosmos3-Nano/cover_image.9e0d260ff51b3b1950658c7387b0048dfe88acf1e2f318e2cda55dc55fa55e81.webp","tags":["openai"],"pricing":{"short":"$0.0108 / second (480p)","full":null,"table":null,"type":"frame_units","cents_per_frame_unit":0.045},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-05-28T21:15:41+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3.5-397B-A17B","type":"text-generation","reported_type":"text-generation","description":"Qwen3.5-397B-A17B is Alibaba's most capable Qwen3.5 model, a Mixture-of-Experts architecture with 397B total parameters and 17B activated per token. 
It features a 262K token context window (extensible to 1M with YaRN), thinking/reasoning mode, tool calling with MCP integration, and support for 201 languages. Sets state-of-the-art results on reasoning, coding, math, and multimodal benchmarks.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3.5-397B-A17B/cover_image.c692d4649df681a3ba8bef636f5954311afb302d2c129bb5799c1da5d20c701e.webp","tags":["structured-output","tools","json","reasoning","openai","cc-native","multimodal","input-video","priority","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":4.5e-05,"cents_per_output_token":0.0003,"rate_per_input_token_cached":0.48888889,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2026-03-24T01:01:13+00:00","private":0,"is_partner":false},{"model_name":"moonshotai/Kimi-K2-Thinking","type":"text-generation","reported_type":"text-generation","description":"Kimi K2 Thinking is the latest, most capable version of open-source thinking model developed by MoonshotAI","cover_img_url":"https://shared.deepinfra.com/models/moonshotai/Kimi-K2-Thinking/cover_image.5841c84692f14cc350e7901386cfa0e0e6a49bfd2de40fe12ba89ccfdf6e0a9d.webp","tags":["openai","tools","reasoning","json","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":4.7e-05,"cents_per_output_token":0.0002,"rate_per_input_token_cached":0.3,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":"moonshotai/Kimi-K2.5","deprecated":1776286098,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2025-11-11T01:53:03+00:00","private":0,"is_partner":false},{"model_name":"meta-llama/Llama-3.2-90B-Vision-Instruct","type":"text-generation","reported_type":"text-generation","description":"The Llama 
90B Vision model is a top-tier, 90-billion-parameter multimodal model designed for the most challenging visual reasoning and language tasks. It offers unparalleled accuracy in image captioning, visual question answering, and advanced image-text comprehension. Pre-trained on vast multimodal datasets and fine-tuned with human feedback, the Llama 90B Vision is engineered to handle the most demanding image-based AI tasks.  This model is perfect for industries requiring cutting-edge multimodal AI capabilities, particularly those dealing with complex, real-time visual and textual analysis.","cover_img_url":"https://shared.deepinfra.com/models/meta-llama/Llama-3.2-90B-Vision-Instruct/cover_image.25052639e37e5db9fa3e1685896ced5c0ef402f08e4da15a8db4d3627236a787.webp","tags":["openai","multimodal","cc-native"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3.5e-05,"cents_per_output_token":4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":32768,"replaced_by":"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8","deprecated":1757621478,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-09-27T00:19:24+00:00","private":0,"is_partner":false},{"model_name":"deepseek-ai/Janus-Pro-7B","type":"text-to-image","reported_type":"text-to-image","description":"Janus-Pro is a novel autoregressive framework that unifies multimodal understanding and generation. It addresses the limitations of previous approaches by decoupling visual encoding into separate pathways, while still utilizing a single, unified transformer architecture for processing. The decoupling not only alleviates the conflict between the visual encoder’s roles in understanding and generation, but also enhances the framework’s flexibility. Janus-Pro surpasses previous unified model and matches or exceeds the performance of task-specific models. 
The simplicity, high flexibility, and effectiveness of Janus-Pro make it a strong candidate for next-generation unified multimodal models.","cover_img_url":"https://shared.deepinfra.com/models/deepseek-ai/Janus-Pro-7B/cover_image.58ef6d559f158621698e6c176e33b0f290835026420d7e7838a7515130b58998.webp","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":0.2,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-02-10T10:39:48+00:00","private":0,"is_partner":false},{"model_name":"moonshotai/Kimi-K2.6","type":"text-generation","reported_type":"text-generation","description":"Kimi K2.6 is an open-source, native multimodal agentic model that advances practical capabilities in long-horizon coding, coding-driven design, proactive autonomous execution, and swarm-based task orchestration.","cover_img_url":"https://shared.deepinfra.com/models/moonshotai/Kimi-K2.6/cover_image.da131f8e266d5af0f8e28c2dcf6575af66977cd4ba0c72ec9147d34b679b5e85.webp","tags":["openai","json","reasoning","tools","structured-output","can-disable-reasoning","multimodal","priority","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":7.5e-05,"cents_per_output_token":0.00035,"rate_per_input_token_cached":0.2,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":null,"deprecated":null,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2026-04-21T03:13:29+00:00","private":0,"is_partner":false},{"model_name":"google/gemini-3.1-flash-lite","type":"text-generation","reported_type":"text-generation","description":"Bring any idea to life with state-of-the-art reasoning to help you learn, build, and plan anything. 
Best for high-volume tasks that need efficiency and intelligence.","cover_img_url":"https://shared.deepinfra.com/models/google/gemini-3.1-flash-lite/cover_image.525a90a425f2f2c1855b66bdf94fe2025c945e45e2c724d78a4a4c8d67e1d034.webp","tags":["no-free-anon","cc-native","openai","reasoning","tools","structured-output","multimodal","ocr","json"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2.5e-05,"cents_per_output_token":0.00015,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":1000000,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-05-13T03:51:34+00:00","private":0,"is_partner":true},{"model_name":"black-forest-labs/FLUX-2-dev","type":"text-to-image","reported_type":"text-to-image","description":"Brand-new Flux2 Dev introduces a faster, more modular architecture for next-generation image generation pipelines. It delivers improved performance, cleaner control APIs, and a significantly more flexible development workflow for custom inference 
setups.","cover_img_url":"https://shared.deepinfra.com/models/black-forest-labs/FLUX-2-dev/cover_image.97fe66df97f4d737785e650b4a585a19be4e18d52cc430bc2d1b2b6080ae2f31.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":1.0,"default_width":1024,"default_height":1024,"default_iterations":28,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-11-25T09:44:48+00:00","private":0,"is_partner":false},{"model_name":"openbmb/MiniCPM-Llama3-V-2_5","type":"text-generation","reported_type":"text-generation","description":"","cover_img_url":"","tags":["openai","cc-native","multimodal"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3.4e-05,"cents_per_output_token":3.4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":8192,"replaced_by":"meta-llama/Llama-3.2-11B-Vision-Instruct","deprecated":1728579689,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2024-08-09T22:02:53+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-Max-Thinking","type":"text-generation","reported_type":"text-generation","description":"The latest flagship reasoning model in the Qwen3 family. 
Further enhanced by multiple innovations like adaptive tool-use and advanced test-time scaling techniques","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-Max-Thinking/cover_image.fa7888e2940746f47be3fb3eaa39ecddf187d085e0b96805a24e0db063d46f59.webp","tags":["openai","tools","no-free-anon","json","non-reasoning","structured-output","cc-native","featured"],"pricing":{"short":"$1.20 in $6.00 out $0.24 cached / 1M tokens","full":"$1.2 in $6 out $0.24 cached <= 32K, $2.4 in $12 out $0.48 cached <= 128K, $3 in $15 out $0.6 cached > 128K","table":null,"type":"tokens","cents_per_input_token":0.00012,"cents_per_output_token":0.0006,"rate_per_input_token_cached":0.2,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":256000,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-02-13T15:52:45+00:00","private":0,"is_partner":true},{"model_name":"google/gemini-1.5-flash-8b","type":"text-generation","reported_type":"text-generation","description":"","cover_img_url":"","tags":["openai","no-free-anon","multimodal","tools","cc-native","json","structured-output","ocr"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":3.75e-06,"cents_per_output_token":1.5e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":1000000,"replaced_by":null,"deprecated":1749069072,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-03-05T23:30:51+00:00","private":0,"is_partner":true},{"model_name":"microsoft/phi-4","type":"text-generation","reported_type":"text-generation","description":"Phi-4 is a model built upon a blend of synthetic datasets, data from filtered public domain websites, and acquired academic books and Q&A datasets. 
The goal of this approach was to ensure that small capable models were trained with data focused on high quality and advanced reasoning.","cover_img_url":"https://shared.deepinfra.com/models/microsoft/phi-4/cover_image.aee3c097e61545c82caae333d0494bd36718766775525b8fe75f6e8a3e5edc9d.webp","tags":["json","openai","non-reasoning","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":7e-06,"cents_per_output_token":1.4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":16384,"replaced_by":null,"deprecated":null,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2025-01-09T22:57:56+00:00","private":0,"is_partner":false},{"model_name":"allenai/Olmo-3.1-32B-Instruct","type":"text-generation","reported_type":"text-generation","description":"Olmo is a series of Open language models, developed by Allen Institute for AI (Ai2), designed to enable the science of language models. ","cover_img_url":"https://shared.deepinfra.com/models/allenai/Olmo-3.1-32B-Instruct/cover_image.4b73f75a4b04349fab603dd34590a01e1e5e4017c39e4e1a67ab4d0e4c385c4b.webp","tags":["openai","tools","json","non-reasoning","structured-output","cc-native"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2e-05,"cents_per_output_token":6e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":65536,"replaced_by":"google/gemma-4-31B-it","deprecated":1778120303,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-12-17T23:10:54+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen-Image-Max","type":"text-to-image","reported_type":"text-to-image","description":"Compared with the Plus series, it significantly reduces the “AI-like” feel in generated images, enhancing their realism. 
It delivers more lifelike material textures for human subjects, finer and more detailed natural textures, and more visually appealing text rendering.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen-Image-Max/cover_image.25b81707e89b3656826656c22f7a567566a0ba4ebedfe3cdd96ee29756e8b251.webp","tags":["no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":7.5,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-02-18T12:43:55+00:00","private":0,"is_partner":true},{"model_name":"sentence-transformers/clip-ViT-B-32-multilingual-v1","type":"embeddings","reported_type":"embeddings","description":"This model is a multilingual version of the OpenAI CLIP-ViT-B32 model, which maps text and images to a common dense vector space. It includes a text embedding model that works for 50+ languages and an image encoder from CLIP. The model was trained using Multilingual Knowledge Distillation, where a multilingual DistilBERT model was trained as a student model to align the vector space of the original CLIP image encoder across many languages.","cover_img_url":"","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":5e-07},"max_tokens":512,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-03-03T02:52:46+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-Embedding-0.6B","type":"embeddings","reported_type":"embeddings","description":"The Qwen3 Embedding model series is the latest proprietary model of the Qwen family, specifically designed for text embedding and ranking tasks. 
Building upon the dense foundational models of the Qwen3 series, it provides a comprehensive range of text embeddings and reranking models in various sizes (0.6B, 4B, and 8B).","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-Embedding-0.6B/cover_image.64532474108cce61f32c9aa7f01fc1a46e27722ca71cf49a1f8e89403c8445df.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":1e-06},"max_tokens":32768,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-07-01T20:40:17+00:00","private":0,"is_partner":false},{"model_name":"nvidia/Nemotron-Content-Safety-3.5","type":"text-generation","reported_type":"text-generation","description":"Nemotron Content Safety 3.5 is a multimodal safety classifier developed by NVIDIA. A  compact safety model that handles text, images, and custom policies. It outputs a safe/unsafe classification plus a reasoning trace, and can be used as an inference-time guardrail, as a judge for LLM safety testing and evaluation, or with the accompanying training dataset to post-train models for safer behavior.","cover_img_url":"https://shared.deepinfra.com/models/nvidia/Nemotron-Content-Safety-3.5/cover_image.d06912f7f81adc62820e4e2b25ee302675afc178ef97bcf4c5f99fe567a48759.webp","tags":["multimodal","cc-native","openai","ocr","non-reasoning","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2e-05,"cents_per_output_token":2e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":131072,"replaced_by":null,"deprecated":null,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2026-06-01T17:31:05+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3.5-122B-A10B","type":"text-generation","reported_type":"text-generation","description":"Qwen3.5-122B-A10B is a large Mixture-of-Experts model 
from Alibaba's Qwen3.5 series with 122B total parameters and 10B activated per token. It features a 262K token context window (extensible to 1M with YaRN), thinking/reasoning mode, tool calling, and support for 201 languages. Excels at complex reasoning, coding, multimodal understanding, and agentic tasks with the efficiency of sparse activation.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3.5-122B-A10B/cover_image.547b018401e117840606266bd69c27a9a0139953004899b939e742a4ab6a51b1.webp","tags":["structured-output","tools","json","reasoning","openai","multimodal","input-video","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2.9e-05,"cents_per_output_token":0.00024,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":16384,"replaced_by":null,"deprecated":null,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2026-03-24T00:58:08+00:00","private":0,"is_partner":false},{"model_name":"sentence-transformers/multi-qa-mpnet-base-dot-v1","type":"embeddings","reported_type":"embeddings","description":"We present a sentence transformation model that maps sentences and paragraphs to a 768-dimensional dense vector space, suitable for semantic search tasks. The model is trained on 215 million question-answer pairs from various sources, including WikiAnswers, PAQ, Stack Exchange, MS MARCO, GOOAQ, Amazon QA, Yahoo Answers, Search QA, ELI5, and Natural Questions. 
Our model uses a contrastive learning objective.","cover_img_url":"","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":5e-07},"max_tokens":512,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-03-03T02:50:49+00:00","private":0,"is_partner":false},{"model_name":"inworld-ai/inworld-tts-1.5-max","type":"text-to-speech","reported_type":"text-to-speech","description":"High-quality multilingual text-to-speech model by Inworld AI with 130+ preset voices across 15 languages. Supports voice cloning, word-level timestamps, and streaming. Optimized for natural, expressive speech with <250ms time-to-first-audio.","cover_img_url":"https://shared.deepinfra.com/models/inworld-ai/inworld-tts-1.5-max/cover_image.a851df7aa7688c9bd62759d01a937d917c5ca0b91645228d3db77745229d0dd5.webp","tags":["voice","no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"input_character_length","cents_per_input_chars":0.005},"max_tokens":null,"replaced_by":"inworld-ai/realtime-tts-1.5-max","deprecated":1778020021,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-03-13T00:07:38+00:00","private":0,"is_partner":true},{"model_name":"Qwen/Qwen3.5-0.8B","type":"text-generation","reported_type":"text-generation","description":"Qwen3.5-0.8B is Alibaba's smallest model in the Qwen3.5 series, featuring a hybrid Gated Delta Networks and sparse Mixture-of-Experts architecture. Despite its compact size, it supports a 262K token context window, 201 languages, thinking/reasoning mode, and tool calling. 
Ideal for edge deployments, resource-constrained environments, and lightweight inference tasks.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3.5-0.8B/cover_image.be7de9546316a3fadcf1ac3d2de3099c3a7b1c79736b45eba43976e5b6ddfd0d.webp","tags":["structured-output","tools","json","reasoning","openai","cc-native","multimodal","input-video","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1e-06,"cents_per_output_token":5e-06,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":"Qwen/Qwen3.5-9B","deprecated":1781216919,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2026-03-20T23:26:38+00:00","private":0,"is_partner":false},{"model_name":"Zyphra/Zonos-v0.1-transformer","type":"text-to-speech","reported_type":"text-to-speech","description":"Zonos-v0.1 is a leading open-weight text-to-speech model trained on more than 200k hours of varied multilingual speech, delivering expressiveness and quality on par with—or even surpassing—top TTS providers.  Our model enables highly natural speech generation from text prompts when given a speaker embedding or audio prefix, and can accurately perform speech cloning when given a reference clip spanning just a few seconds. The conditioning setup also allows for fine control over speaking rate, pitch variation, audio quality, and emotions such as happiness, fear, sadness, and anger. 
The model outputs speech natively at 44kHz.","cover_img_url":"https://shared.deepinfra.com/models/Zyphra/Zonos-v0.1-transformer/cover_image.7a4f844215584e5d01c81a8564e045736391b0161820e3eb9f618442272748a9.webp","tags":["voice"],"pricing":{"short":null,"full":null,"table":null,"type":"input_character_length","cents_per_input_chars":0.0007},"max_tokens":null,"replaced_by":null,"deprecated":1778878709,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-02-12T05:11:38+00:00","private":0,"is_partner":false},{"model_name":"black-forest-labs/FLUX-2-klein-9b","type":"text-to-image","reported_type":"text-to-image","description":"The best quality-to-latency ratio, production apps model of the Flux 2 family. Frontier visual intelligence — state-of-the-art image generation and editing from Black Forest Labs","cover_img_url":"https://shared.deepinfra.com/models/black-forest-labs/FLUX-2-klein-9b/cover_image.8f795bc60d87299a58d1ee102a9338098651cf3ce231afd3a3340981eba5d316.webp","tags":["openai","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":1.5,"default_width":1024,"default_height":1024,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-01-20T17:13:04+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-14B","type":"text-generation","reported_type":"text-generation","description":"Qwen3 is the latest generation of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support. 
","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-14B/cover_image.1201f1b95d6039377e39f7e52777b9e669c17ee28df3cf695172dbf189089420.webp","tags":["openai","tools","reasoning","json","non-reasoning","structured-output","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.2e-05,"cents_per_output_token":2.4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":40960,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-04-28T23:15:06+00:00","private":0,"is_partner":false},{"model_name":"Bria/erase","type":"text-to-image","reported_type":"text-to-image","description":"Bria Eraser enables precise removal of unwanted objects from images while maintaining high-quality outputs. Trained exclusively on licensed data for safe and risk-free commercial use","cover_img_url":"https://shared.deepinfra.com/models/Bria/erase/cover_image.33bc4fc44b84034eacbc417c64757680e93498ecab8a057d870ad689d251939a.webp","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":4.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-09-01T13:41:36+00:00","private":0,"is_partner":true},{"model_name":"openai/whisper-medium.en","type":"automatic-speech-recognition","reported_type":"automatic-speech-recognition","description":"Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation. Trained on 680k hours of labelled data, Whisper models demonstrate a strong ability to generalise to many datasets and domains without fine-tuning. 
The primary intended users of these models are AI researchers studying robustness, generalisation, and capabilities of the current model.","cover_img_url":"","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"time","cents_per_sec":0.05},"max_tokens":null,"replaced_by":"openai/whisper-large-v3","deprecated":1722034223,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-02-16T05:13:28+00:00","private":0,"is_partner":false},{"model_name":"Austism/chronos-hermes-13b-v2","type":"text-generation","reported_type":"text-generation","description":"This offers the imaginative writing style of chronos while still retaining coherency and being capable. Outputs are long and utilize exceptional prose. Supports a maximum context length of 4096. The model follows the Alpaca prompt format.","cover_img_url":"","tags":["openai","json"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.3e-05,"cents_per_output_token":1.3e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":4096,"replaced_by":"Gryphe/MythoMax-L2-13b","deprecated":1727971605,"quantization":"fp16","mmlu":null,"expected":null,"create_ts":"2024-01-24T02:24:45+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-Coder-480B-A35B-Instruct","type":"text-generation","reported_type":"text-generation","description":"Qwen3-Coder-480B-A35B-Instruct is the Qwen3's most agentic code model, featuring Significant Performance on Agentic Coding, Agentic Browser-Use and other foundational coding tasks, achieving results comparable to Claude 
Sonnet.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-Coder-480B-A35B-Instruct/cover_image.d28599e171a40010634df3f9888e4771b4a4029e1ce355338e50ceade04042f0.webp","tags":["openai","tools","json","non-reasoning","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":4e-05,"cents_per_output_token":0.00016,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":262144,"replaced_by":"Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo","deprecated":1777575819,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-07-23T04:15:51+00:00","private":0,"is_partner":false},{"model_name":"stepfun-ai/Step-3.5-Flash","type":"text-generation","reported_type":"text-generation","description":"Step 3.5 Flash is an open-source reasoning model by StepFun with 196B total parameters (11B active) using Mixture of Experts. It features a 256K context window, deep reasoning, tool calling, and agentic capabilities, achieving 97.3 on AIME 2025 and 74.4% on SWE-bench Verified.","cover_img_url":"https://shared.deepinfra.com/models/stepfun-ai/Step-3.5-Flash/cover_image.d2653a5d337f32f1fdf401c53e81cd6cd1a67416d96a28cca46b464fdacdca5d.webp","tags":["openai","tools","non-reasoning","priority","featured"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":9e-06,"cents_per_output_token":3e-05,"rate_per_input_token_cached":0.22222222,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":262144,"replaced_by":"stepfun-ai/Step-3.7-Flash","deprecated":1782760702,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2026-03-24T06:47:32+00:00","private":0,"is_partner":false},{"model_name":"meta-llama/Meta-Llama-3.1-8B-Instruct","type":"text-generation","reported_type":"text-generation","description":"Meta developed and released the Meta Llama 3.1 family of large language models 
(LLMs), a collection of pretrained and instruction tuned generative text models in 8B, 70B and 405B sizes","cover_img_url":"https://shared.deepinfra.com/models/meta-llama/Meta-Llama-3.1-8B-Instruct/cover_image.bf9600ae0ac92c7da76bb908f82c1fb1ab55c510b637fcc2454e368c7bd1cb72.webp","tags":["openai","tools","json","non-reasoning","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2e-06,"cents_per_output_token":5e-06,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":null,"deprecated":null,"quantization":"bfloat16","mmlu":47.0,"expected":null,"create_ts":"2024-07-23T17:54:23+00:00","private":0,"is_partner":false},{"model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-70B","type":"text-generation","reported_type":"text-generation","description":"DeepSeek-R1-Distill-Llama-70B is a highly efficient language model that leverages knowledge distillation to achieve state-of-the-art performance. This model distills the reasoning patterns of larger models into a smaller, more agile architecture, resulting in exceptional results on benchmarks like AIME 2024, MATH-500, and LiveCodeBench. With 70 billion parameters, DeepSeek-R1-Distill-Llama-70B offers a unique balance of accuracy and efficiency, making it an ideal choice for a wide range of natural language processing tasks. 
","cover_img_url":"https://shared.deepinfra.com/models/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/cover_image.8390f5b9b1330c345b923226e42e74d21c4002336df11becb397a55e3e828958.webp","tags":["json","openai","non-reasoning","reasoning"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":7e-05,"cents_per_output_token":8e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":"deepseek-ai/DeepSeek-R1-0528","deprecated":1781220528,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-01-20T22:10:25+00:00","private":0,"is_partner":false},{"model_name":"thenlper/gte-base","type":"embeddings","reported_type":"embeddings","description":"The GTE models are trained by Alibaba DAMO Academy. They are mainly based on the BERT framework and currently offer three different sizes of models, including GTE-large, GTE-base, and GTE-small. The GTE models are trained on a large-scale corpus of relevance text pairs, covering a wide range of domains and scenarios. This enables the GTE models to be applied to various downstream tasks of text embeddings, including information retrieval, semantic textual similarity, text reranking, etc.","cover_img_url":"","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":5e-07},"max_tokens":512,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-11-22T20:07:29+00:00","private":0,"is_partner":false},{"model_name":"nvidia/Nemotron-3.5-ASR-Streaming-Multilingual-0.6b","type":"automatic-speech-recognition","reported_type":"automatic-speech-recognition","description":"Nemotron 3.5 ASR Streaming Multilingual is an open 0.6B-parameter prompt-conditioned cache-aware FastConformer-RNNT model, engineered for low-latency streaming transcription across 40+ languages. 
It powers real-time captioning, voice agents, and multilingual transcription pipelines—replacing separate per-language Whisper deployments with a single inference pass.","cover_img_url":"https://shared.deepinfra.com/models/nvidia/Nemotron-3.5-ASR-Streaming-Multilingual-0.6b/cover_image.f8e5bf4eb79976dd5dbd79560a7c27d14354e582762f4c40b3697440f1057d70.webp","tags":["openai","multilingual","streaming"],"pricing":{"short":null,"full":null,"table":null,"type":"input_length","cents_per_input_sec":0.000333},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-06-03T12:07:11+00:00","private":0,"is_partner":false},{"model_name":"mistralai/Mistral-Nemo-Instruct-2407","type":"text-generation","reported_type":"text-generation","description":"12B model trained jointly by Mistral AI and NVIDIA, it significantly outperforms existing models smaller or similar in size.","cover_img_url":"https://shared.deepinfra.com/models/mistralai/Mistral-Nemo-Instruct-2407/cover_image.353ef9274d841a75e1f3fcc18cec4bee3df17e8d931b5613492191651d2dd539.webp","tags":["openai","tools","json","non-reasoning","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2e-06,"cents_per_output_token":4e-06,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2024-08-31T00:38:25+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-Coder-30B-A3B-Instruct","type":"text-generation","reported_type":"text-generation","description":"Qwen3-Coder-30B-A3B-Instruct is a high-performance code generation model optimized for agentic coding and complex programming tasks. With 30.5B total parameters and 3.3B activated through Mixture-of-Experts architecture, it delivers exceptional efficiency. 
The model features native support for 256K token context (extendable to 1M), making it ideal for repository-scale code understanding. It excels at tool calling, browser automation, and multi-step coding workflows.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-Coder-30B-A3B-Instruct/cover_image.e4306b2d313d34adebbcbb9ad6d1a265745570443cbb437c9c44b18ebb60068c.webp","tags":["openai","tools","json","structured_output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":7e-06,"cents_per_output_token":2.6e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":262144,"replaced_by":"Qwen/Qwen3-30B-A3B","deprecated":1763168602,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-10-08T02:30:34+00:00","private":0,"is_partner":false},{"model_name":"intfloat/multilingual-e5-large","type":"embeddings","reported_type":"embeddings","description":"The Multilingual-E5-large model is a 24-layer text embedding model with an embedding size of 1024, trained on a mixture of multilingual datasets and supporting 100 languages.","cover_img_url":"","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":1e-06},"max_tokens":512,"replaced_by":null,"deprecated":null,"quantization":"fp32","mmlu":null,"expected":null,"create_ts":"2024-06-04T11:52:00+00:00","private":0,"is_partner":false},{"model_name":"microsoft/Phi-3-medium-4k-instruct","type":"text-generation","reported_type":"text-generation","description":"The Phi-3-Medium-4K-Instruct is a powerful and lightweight language model with 14 billion parameters, trained on high-quality data to excel in instruction following and safety measures. 
It demonstrates exceptional performance across benchmarks, including common sense, language understanding, and logical reasoning, outperforming models of similar size.","cover_img_url":"https://shared.deepinfra.com/models/microsoft/Phi-3-medium-4k-instruct/cover_image.6c7f810d3584719025ba43de13448d318bda84043a08af1b9718c61d9498b18c.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.4e-05,"cents_per_output_token":1.4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":4096,"replaced_by":"meta-llama/Meta-Llama-3.1-8B-Instruct","deprecated":1726163265,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-06-13T01:08:24+00:00","private":0,"is_partner":false},{"model_name":"google/gemma-3-27b-it","type":"text-generation","reported_type":"text-generation","description":"Gemma 3 introduces multimodality, supporting vision-language input and text outputs. It handles context windows up to 128k tokens, understands over 140 languages, and offers improved math, reasoning, and chat capabilities, including structured outputs and function calling. 
Gemma 3 27B is Google's latest open source model, successor to Gemma 2","cover_img_url":"https://shared.deepinfra.com/models/google/gemma-3-27b-it/cover_image.acdca01c69780104ab35653f7dd78efa772e7fd70322b21a7a7bad10fcbb407c.webp","tags":["ocr","openai","tools","multimodal","json","non-reasoning","structured-output","cc-native","priority"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":8e-06,"cents_per_output_token":1.6e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":1.5},"max_tokens":131072,"replaced_by":null,"deprecated":null,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-03-12T21:30:48+00:00","private":0,"is_partner":false},{"model_name":"openchat/openchat-3.6-8b","type":"text-generation","reported_type":"text-generation","description":"Openchat 3.6 is a LLama-3-8b fine tune that outperforms it on multiple benchmarks.","cover_img_url":"https://shared.deepinfra.com/models/openchat/openchat-3.6-8b/cover_image.c5792073e4034a0847ff5112e00356adb411c4a5c900ed22c32ad65c5d97e8d1.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":5.5e-06,"cents_per_output_token":5.5e-06,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":8192,"replaced_by":"meta-llama/Meta-Llama-3.1-8B-Instruct","deprecated":1725662285,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-06-01T00:53:06+00:00","private":0,"is_partner":false},{"model_name":"intfloat/e5-base-v2","type":"embeddings","reported_type":"embeddings","description":"Text Embeddings by Weakly-Supervised Contrastive Pre-training. Model has 12 layers and 768 out dim. 
","cover_img_url":"","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":5e-07},"max_tokens":512,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-11-22T20:17:35+00:00","private":0,"is_partner":false},{"model_name":"black-forest-labs/FLUX-pro","type":"text-to-image","reported_type":"text-to-image","description":"Black Forest Labs' first flagship model based on Flux latent rectified flow transformers","cover_img_url":"https://shared.deepinfra.com/models/black-forest-labs/FLUX-pro/cover_image.a1e96809cff343273e8dc70d17b7014ae2a524b75160b1dd68101127015ad940.webp","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":5.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2024-10-11T23:28:43+00:00","private":0,"is_partner":true},{"model_name":"ByteDance/Seed-2.0-code","type":"text-generation","reported_type":"text-generation","description":"A coding model optimized for real-world development environments, with reliable tool use in common IDEs such as Claude Code. 
It delivers strong front-end performance and supports Skills.","cover_img_url":"https://shared.deepinfra.com/models/ByteDance/Seed-2.0-code/cover_image.2c28f96ec60c133595342c8b0c91e0ba811325e693a294b14e04dcf87f159aed.webp","tags":["reasoning","tools","structured-output","multimodal","no-free-anon","cc-native","json","openai"],"pricing":{"short":null,"full":"$0.50 in $3 out $0.10 cached <= 128K, $1 in $6 out $0.20 cached","table":null,"type":"tokens","cents_per_input_token":5e-05,"cents_per_output_token":0.0003,"rate_per_input_token_cached":0.2,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":256000,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-04-29T12:07:58+00:00","private":0,"is_partner":true},{"model_name":"nvidia/llama-nemotron-embed-vl-1b-v2","type":"embeddings","reported_type":"embeddings","description":"The llama-nemotron-embed-vl-1b-v2 is a high-performance multimodal embedding model designed to transform text queries and document images into dense vector representations for advanced retrieval systems. It excels at understanding complex visual content like charts, tables, and infographics.","cover_img_url":"https://shared.deepinfra.com/models/nvidia/llama-nemotron-embed-vl-1b-v2/cover_image.0deeac95c09a3337a78eec5581adfcc84991a787274d6695760f738688733ad5.webp","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":1e-06},"max_tokens":10240,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-03-11T14:45:29+00:00","private":0,"is_partner":false},{"model_name":"intfloat/e5-large-v2","type":"embeddings","reported_type":"embeddings","description":"Text Embeddings by Weakly-Supervised Contrastive Pre-training. Model has 24 layers and 1024 out dim. 
","cover_img_url":"","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":1e-06},"max_tokens":512,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-11-22T19:59:57+00:00","private":0,"is_partner":false},{"model_name":"mistralai/Mistral-7B-Instruct-v0.3","type":"text-generation","reported_type":"text-generation","description":"Mistral-7B-Instruct-v0.3 is an instruction-tuned model, next iteration of Mistral 7B that has larger vocabulary, newer tokenizer and supports function calling.","cover_img_url":"https://shared.deepinfra.com/models/mistralai/Mistral-7B-Instruct-v0.3/cover_image.716d64cdc98717436953bde0b80dede06d7071f66cb46f1df85caaa270e5cdd6.webp","tags":["openai","tools"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2.8e-06,"cents_per_output_token":5.4e-06,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":32768,"replaced_by":"mistralai/Mistral-Small-3.2-24B-Instruct-2506","deprecated":1757952363,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-06-01T00:55:05+00:00","private":0,"is_partner":false},{"model_name":"Qwen/Qwen3-235B-A22B","type":"text-generation","reported_type":"text-generation","description":"Qwen3 is the latest generation of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. 
Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen3-235B-A22B/cover_image.011f77376c7d6752c9f56907c8fd353c3484a570d3d80f0d61e6fdc03379525b.webp","tags":["openai","json","tools","reasoning"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":1.8e-05,"cents_per_output_token":5.4e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":40960,"replaced_by":"Qwen/Qwen3-235B-A22B-Instruct-2507","deprecated":1760028990,"quantization":"fp8","mmlu":null,"expected":null,"create_ts":"2025-04-28T22:55:17+00:00","private":0,"is_partner":false},{"model_name":"PrunaAI/p-video-avatar","type":"text-to-video","reported_type":"text-to-video","description":"Pruna's talking head video generation model. Provide a portrait image and either a speech script or an audio file, and the model generates a realistic video of the person speaking. Supports multiple voices, languages, and output resolutions.","cover_img_url":"https://shared.deepinfra.com/models/PrunaAI/p-video-avatar/cover_image.e74c7c59e15f5458f2cb354dee1b086deeb49332d502c129e69dcc5651b88123.webp","tags":["no-free-anon"],"pricing":{"short":"$0.025 / second","full":"$0.025 / second for 720P, $0.045 / second for 1080P","table":null,"type":"output_length","cents_per_output_sec":2.25},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-04-30T03:35:25+00:00","private":0,"is_partner":true},{"model_name":"google/veo-3.0","type":"text-to-video","reported_type":"text-to-video","description":"Veo 3 is a state-of-the-art text-to-video model from Google that generates high-fidelity, cinematic videos with synchronized audio from a simple text prompt. 
It excels at creating realistic and imaginative scenes with a deep understanding of natural language and visual dynamics.","cover_img_url":"https://shared.deepinfra.com/models/google/veo-3.0/cover_image.4817ee203a8eeda7540d449add88546a2eb85711c5704f0451b5b9e28cae146e.webp","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"output_length","cents_per_output_sec":40.0},"max_tokens":null,"replaced_by":"google/veo-3.1","deprecated":1779305233,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-08-07T14:50:09+00:00","private":0,"is_partner":true},{"model_name":"ResembleAI/chatterbox-turbo","type":"text-to-speech","reported_type":"text-to-speech","description":"Chatterbox is a family of three state-of-the-art, open-source text-to-speech models by Resemble AI.  We are excited to introduce Chatterbox-Turbo, our most efficient model yet. Built on a streamlined 350M parameter architecture, Turbo delivers high-quality speech with less compute and VRAM than our previous models. We have also distilled the speech-token-to-mel decoder, previously a bottleneck, reducing generation from 10 steps to just one, while retaining high-fidelity audio output.  Paralinguistic tags are now native to the Turbo model, allowing you to use [cough], [laugh], [chuckle], and more to add distinct realism. While Turbo was built primarily for low-latency voice agents, it excels at narration and creative workflows.  
If you like the model but need to scale or tune it for higher accuracy, check out our competitively priced TTS service (link).","cover_img_url":"https://shared.deepinfra.com/models/ResembleAI/chatterbox-turbo/cover_image.659bc167c86ba8e73af83263bc0de69ae2128cdcd3670c2126352896ae3918b4.webp","tags":["voice","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_character_length","cents_per_input_chars":0.0001},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-12-22T19:05:44+00:00","private":0,"is_partner":false},{"model_name":"stabilityai/sd3.5-medium","type":"text-to-image","reported_type":"text-to-image","description":"  At 2.5 billion parameters, with improved MMDiT-X architecture and training methods, this model is designed to run “out of the box” on consumer hardware, striking a balance between quality and ease of customization. It is capable of generating images ranging between 0.25 and 2 megapixel resolution. 
","cover_img_url":"https://shared.deepinfra.com/models/stabilityai/sd3.5-medium/cover_image.767164fa20bda210e066e17f3f1840cde9376aedd8666227a762a5c3315d82bd.webp","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":3.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":"stabilityai/sdxl-turbo","deprecated":1761252487,"quantization":"bf16","mmlu":null,"expected":null,"create_ts":"2024-10-29T09:20:49+00:00","private":0,"is_partner":false},{"model_name":"ByteDance/Seed-2.0-pro","type":"text-generation","reported_type":"text-generation","description":"Built for the Agent era, it delivers stable performance in complex reasoning and long-horizon tasks, including multi-step planning, visual-text reasoning, video understanding, and advanced analysis.","cover_img_url":"https://shared.deepinfra.com/models/ByteDance/Seed-2.0-pro/cover_image.a92fa66e5a544f52cfef420dd09a68947530ae0208be1a47cb995c071c48040b.webp","tags":["reasoning","cc-native","openai","tools","structured-output","multimodal","no-free-anon","json"],"pricing":{"short":null,"full":"$0.50 in $3 out $0.10 cached <= 128K, $1 in $6 out $0.20 cached","table":null,"type":"tokens","cents_per_input_token":5e-05,"cents_per_output_token":0.0003,"rate_per_input_token_cached":0.2,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":256000,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-04-01T11:40:51+00:00","private":0,"is_partner":true},{"model_name":"Qwen/Qwen2-7B-Instruct","type":"text-generation","reported_type":"text-generation","description":"The 7 billion parameter Qwen2 excels in language understanding, multilingual capabilities, coding, mathematics, and 
reasoning.","cover_img_url":"https://shared.deepinfra.com/models/Qwen/Qwen2-7B-Instruct/cover_image.e8c2257ba46edbd20ef17c81343ad40fdf27f7bc3838837d079557accd2879e2.webp","tags":["openai","tools"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":5.5e-06,"cents_per_output_token":5.5e-06,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":32768,"replaced_by":"Qwen/Qwen3-14B","deprecated":1762392581,"quantization":"bfloat16","mmlu":null,"expected":null,"create_ts":"2024-06-17T19:00:48+00:00","private":0,"is_partner":false},{"model_name":"deepinfra/airoboros-70b","type":"text-generation","reported_type":"text-generation","description":"Latest version of the Airoboros model fine-tunned version of llama-2-70b using the Airoboros dataset. This model is currently running jondurbin/airoboros-l2-70b-2.2.1 ","cover_img_url":"https://shared.deepinfra.com/models/deepinfra/airoboros-70b/cover_image.4df6a78233488497ec8cad9a032e070a1c2e0c510c7d3b2ebd5c159e15b69793.webp","tags":["openai","json"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":7e-05,"cents_per_output_token":9e-05,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":4096,"replaced_by":"meta-llama/Llama-3.3-70B-Instruct-Turbo","deprecated":1762392581,"quantization":"fp16","mmlu":null,"expected":null,"create_ts":"2023-10-14T01:08:35+00:00","private":0,"is_partner":false},{"model_name":"Bria/enhance","type":"text-to-image","reported_type":"text-to-image","description":"Bria Enhance improves overall image quality by sharpening details, balancing colors, and boosting clarity for crisp, professional visuals. 
Trained only on licensed data, it’s safe, reliable, and ready for commercial use.","cover_img_url":"https://shared.deepinfra.com/models/Bria/enhance/cover_image.8477b79997205549321737b608f2c2a73bc1f54b6e3b47568af7db9f58614fa5.webp","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":4.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-09-01T13:41:54+00:00","private":0,"is_partner":true},{"model_name":"moonshotai/Kimi-K2.5-Turbo","type":"text-generation","reported_type":"text-generation","description":"","cover_img_url":"","tags":["openai","tools","reasoning","json","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":6e-05,"cents_per_output_token":0.0003,"rate_per_input_token_cached":0.16666667,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":262144,"replaced_by":"moonshotai/Kimi-K2.5","deprecated":1776285488,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-03-06T23:02:27+00:00","private":0,"is_partner":false},{"model_name":"canopylabs/orpheus-3b-0.1-ft","type":"text-to-speech","reported_type":"text-to-speech","description":"Orpheus TTS is a state-of-the-art, Llama-based Speech-LLM designed for high-quality, empathetic text-to-speech generation. 
This model has been finetuned to deliver human-level speech synthesis, achieving exceptional clarity, expressiveness, and real-time streaming performances.","cover_img_url":"https://shared.deepinfra.com/models/canopylabs/orpheus-3b-0.1-ft/cover_image.7d9b9bff64d8b37db243f67c58734813839e2ce93ebe1d692e9aff3a0a3e01cf.webp","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"input_character_length","cents_per_input_chars":0.0007},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-03-25T10:05:56+00:00","private":0,"is_partner":false},{"model_name":"google/veo-3.1","type":"text-to-video","reported_type":"text-to-video","description":"Veo 3.1 is the latest text-to-video model from Google that generates high-fidelity, cinematic videos with synchronized audio from a simple text prompt. It excels at creating realistic and imaginative scenes with a deep understanding of natural language and visual dynamics.","cover_img_url":"https://shared.deepinfra.com/models/google/veo-3.1/cover_image.7e5c95f84a02e69592af61e2ac601aced4d8e64819c9c1276c20eb3414242cbb.webp","tags":["no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"output_length","cents_per_output_sec":40.0},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-10-16T12:59:44+00:00","private":0,"is_partner":true},{"model_name":"sentence-transformers/all-MiniLM-L12-v2","type":"embeddings","reported_type":"embeddings","description":"We present a sentence transformation model that generates semantically similar sentences. Our model is based on the Sentence-Transformers architecture and was trained on a large dataset of sentence pairs. 
We evaluate the effectiveness of our model by measuring its ability to generate similar sentences that are close to the original sentence in meaning.","cover_img_url":"","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":5e-07},"max_tokens":512,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-02-28T17:51:44+00:00","private":0,"is_partner":false},{"model_name":"black-forest-labs/FLUX-1.1-pro","type":"text-to-image","reported_type":"text-to-image","description":"Black Forest Labs' latest state-of-the art proprietary model sporting top of the line prompt following, visual quality, details and output diversity.","cover_img_url":"https://shared.deepinfra.com/models/black-forest-labs/FLUX-1.1-pro/cover_image.1d9b99e7fdccdaebc20319d69512c2d2acc632f7ded8d0b19ecadeee596955df.webp","tags":["no-free-anon","openai"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":4.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2024-10-11T22:50:12+00:00","private":0,"is_partner":true},{"model_name":"BAAI/bge-m3-multi","type":"embeddings","reported_type":"embeddings","description":"BGE-M3 is a multilingual text embedding model developed by BAAI, distinguished by its Multi-Linguality (supporting 100+ languages), Multi-Functionality (unified dense, multi-vector, and sparse retrieval), and Multi-Granularity (handling inputs from short queries to 8192-token documents). 
It achieves state-of-the-art retrieval performance across diverse benchmarks while maintaining a single model for multiple retrieval modes.","cover_img_url":"","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":1e-06},"max_tokens":8192,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2025-02-28T15:42:02+00:00","private":0,"is_partner":false},{"model_name":"Wan-AI/Wan2.7-Image-Edit","type":"text-to-image","reported_type":"text-to-image","description":"Bleeding edge image model supporting generation and editing, text to image, text/image to sequential images, image editing, multi-image reference generation, and interactive editing. Delivers enhanced performance in text rendering, subject consistency, and complex instruction following","cover_img_url":"https://shared.deepinfra.com/models/Wan-AI/Wan2.7-Image-Edit/cover_image.705345137ca13a7b4bf3a2a973d281f49c97c39b1cea06ae69461a421995e05d.webp","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":3.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-04-01T13:51:18+00:00","private":0,"is_partner":true},{"model_name":"sentence-transformers/paraphrase-MiniLM-L6-v2","type":"embeddings","reported_type":"embeddings","description":"We present a sentence similarity model based on the Sentence Transformers architecture, which maps sentences to a 384-dimensional dense vector space. The model uses a pre-trained BERT encoder and applies mean pooling on top of the contextualized word embeddings to obtain sentence embeddings. 
We evaluate the model on the Sentence Embeddings Benchmark.","cover_img_url":"","tags":["openai"],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":5e-07},"max_tokens":512,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2023-03-03T02:57:50+00:00","private":0,"is_partner":false},{"model_name":"nvidia/llama-nemotron-rerank-vl-1b-v2","type":"reranker","reported_type":"reranker","description":"The llama-nemotron-rerank-vl-1b-v2 is a 1.7B parameter multimodal reranking model designed to evaluate and order the relevance of document images and text against specific user queries. It excels at understanding complex visual content like charts, tables, and infographics.","cover_img_url":"https://shared.deepinfra.com/models/nvidia/llama-nemotron-rerank-vl-1b-v2/cover_image.0deeac95c09a3337a78eec5581adfcc84991a787274d6695760f738688733ad5.webp","tags":[],"pricing":{"short":null,"full":null,"table":null,"type":"input_tokens","cents_per_input_token":1e-06},"max_tokens":10240,"replaced_by":null,"deprecated":null,"quantization":"bf16","mmlu":null,"expected":null,"create_ts":"2026-03-12T10:39:35+00:00","private":0,"is_partner":false},{"model_name":"moonshotai/Kimi-K2-Instruct-0905","type":"text-generation","reported_type":"text-generation","description":"Kimi K2 0905 is the September update of Kimi K2 0711. It is a large-scale Mixture-of-Experts (MoE) language model developed by Moonshot AI, featuring 1 trillion total parameters with 32 billion active per forward pass. It supports long-context inference up to 256k tokens, extended from the previous 128k.  This update improves agentic coding with higher accuracy and better generalization across scaffolds, and enhances frontend coding with more aesthetic and functional outputs for web, 3D, and related tasks. Kimi K2 is optimized for agentic capabilities, including advanced tool use, reasoning, and code synthesis. 
It excels across coding (LiveCodeBench, SWE-bench), reasoning (ZebraLogic, GPQA), and tool-use (Tau2, AceBench) benchmarks. The model is trained with a novel stack incorporating the MuonClip optimizer for stable large-scale MoE training.","cover_img_url":"https://shared.deepinfra.com/models/moonshotai/Kimi-K2-Instruct-0905/cover_image.18141caabe56debf6a63c63affe8d47959efec30be121bd7292d63652ac63a65.webp","tags":["openai","tools","json","non-reasoning","structured-output"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":4e-05,"cents_per_output_token":0.0002,"rate_per_input_token_cached":0.375,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":131072,"replaced_by":"moonshotai/Kimi-K2.5","deprecated":1776286096,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2025-09-06T07:12:48+00:00","private":0,"is_partner":false},{"model_name":"ClarityAI/flux","type":"text-to-image","reported_type":"text-to-image","description":"ClarityAI/flux integrates the Flux AI model into the upscaling process, enabling high-resolution enhancements with superior face preservation and support for LoRAs to apply specific styles or identities.","cover_img_url":"","tags":["no-free-anon"],"pricing":{"short":null,"full":null,"table":null,"type":"image_units","cents_per_image_unit":20.0,"default_width":0,"default_height":0,"default_iterations":0,"default_price_cents":null,"usage_from_cost":false},"max_tokens":null,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-01-26T12:49:46+00:00","private":0,"is_partner":true},{"model_name":"google/gemini-3.1-pro","type":"text-generation","reported_type":"text-generation","description":"Bring any idea to life with state-of-the-art reasoning to help you learn, build, and plan anything. 
Best for complex tasks and bringing creative concepts to life.","cover_img_url":"https://shared.deepinfra.com/models/google/gemini-3.1-pro/cover_image.5540fa83e6f0b6aaa27850abbc52202f6bb81a007fc8eec0510a41190e0a7601.webp","tags":["json","openai","multimodal","no-free-anon","tools","ocr","reasoning","structured-output","cc-native"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":0.0002,"cents_per_output_token":0.0012,"rate_per_input_token_cached":null,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":1000000,"replaced_by":null,"deprecated":null,"quantization":null,"mmlu":null,"expected":null,"create_ts":"2026-05-12T15:26:34+00:00","private":0,"is_partner":true},{"model_name":"deepseek-ai/DeepSeek-V3.2-Exp","type":"text-generation","reported_type":"text-generation","description":"DeepSeek-V3.2-Exp is an intermediate step toward the next-generation architecture of the DeepSeek models by introducing DeepSeek Sparse Attention—a sparse attention mechanism designed to explore and validate optimizations for training and inference efficiency in long-context scenarios.","cover_img_url":"https://shared.deepinfra.com/models/deepseek-ai/DeepSeek-V3.2-Exp/cover_image.bc51ec1aeec1e0cdee1e6a2619856d46ad912a0c9c3e11452e9f1126e8a15305.webp","tags":["openai","tools","reasoning","can-disable-reasoning","structured-output","json"],"pricing":{"short":null,"full":null,"table":null,"type":"tokens","cents_per_input_token":2.1e-05,"cents_per_output_token":3.2e-05,"rate_per_input_token_cached":0.8,"rate_per_input_token_cache_write":null,"rate_per_service_tier_priority":null},"max_tokens":163840,"replaced_by":"deepseek-ai/DeepSeek-V3.2","deprecated":1765415912,"quantization":"fp4","mmlu":null,"expected":null,"create_ts":"2025-09-29T23:43:51+00:00","private":0,"is_partner":false}]