new-api/dto/audio.go

package dto

import (
	"encoding/json"
	"strings"

	"github.com/QuantumNous/new-api/types"

	"github.com/gin-gonic/gin"
)

// AudioRequest is the JSON body of an audio request: the model name, the
// input text, the voice, and a set of optional parameters.
//
// Speed is a pointer combined with omitempty, so an unset speed is left out of
// the encoded JSON while an explicitly provided value (including 0) is kept.
//
// Metadata and the fields in the second group are json.RawMessage values:
// their JSON is stored as raw bytes and is not decoded by this type.
type AudioRequest struct {
	Model          string          `json:"model"`
	Input          string          `json:"input"`
	Voice          string          `json:"voice"`
	Instructions   string          `json:"instructions,omitempty"`
	ResponseFormat string          `json:"response_format,omitempty"`
	Speed          *float64        `json:"speed,omitempty"`
	StreamFormat   string          `json:"stream_format,omitempty"`
	Metadata       json.RawMessage `json:"metadata,omitempty"`
	// Custom fields for vllm-omni.
	TaskType                json.RawMessage `json:"task_type,omitempty"`
	Language                json.RawMessage `json:"language,omitempty"`
	RefAudio                json.RawMessage `json:"ref_audio,omitempty"`
	RefText                 json.RawMessage `json:"ref_text,omitempty"`
	XVectorOnlyMode         json.RawMessage `json:"x_vector_only_mode,omitempty"`
	MaxNewTokens            json.RawMessage `json:"max_new_tokens,omitempty"`
	InitialCodecChunkFrames json.RawMessage `json:"initial_codec_chunk_frames,omitempty"`
	// TODO: ensure that the logic remains correct after the stream is started.
	//Stream                  json.RawMessage `json:"stream,omitempty"`
}

// GetTokenCountMeta builds the token-counting metadata for this request.
//
// The request's Input is always used as the text to count. The token type is
// types.TokenTypeTokenizer when the model name contains "gpt" (case-sensitive
// substring match) and types.TokenTypeTextNumber for every other model.
func (r *AudioRequest) GetTokenCountMeta() *types.TokenCountMeta {
	// Models whose name contains "gpt" are counted with the tokenizer.
	if strings.Contains(r.Model, "gpt") {
		return &types.TokenCountMeta{
			CombineText: r.Input,
			TokenType:   types.TokenTypeTokenizer,
		}
	}

	// Every other model falls back to the text/number token type.
	return &types.TokenCountMeta{
		CombineText: r.Input,
		TokenType:   types.TokenTypeTextNumber,
	}
}

// IsStream reports whether the request asks for a streamed response, which is
// the case exactly when StreamFormat equals "sse". The gin context argument is
// not consulted.
func (r *AudioRequest) IsStream(c *gin.Context) bool {
	const sseStreamFormat = "sse"
	return r.StreamFormat == sseStreamFormat
}

// SetModelName overwrites the request's Model with modelName. An empty
// modelName is ignored, leaving the current Model untouched.
func (r *AudioRequest) SetModelName(modelName string) {
	if modelName == "" {
		// Nothing to apply; keep the existing model.
		return
	}
	r.Model = modelName
}

// AudioResponse is a JSON payload holding a single "text" field.
// NOTE(review): presumably the plain transcription/translation result of an
// audio request; confirm against the callers.
type AudioResponse struct {
	Text string `json:"text"`
}

// WhisperVerboseJSONResponse is a JSON payload carrying the task, language,
// duration, full text and a list of Segment entries. Every field is tagged
// omitempty, so zero values are left out when encoding.
// NOTE(review): the name suggests this mirrors Whisper's "verbose_json"
// response format, and Duration is presumably in seconds; confirm against the
// upstream API.
type WhisperVerboseJSONResponse struct {
	Task     string    `json:"task,omitempty"`
	Language string    `json:"language,omitempty"`
	Duration float64   `json:"duration,omitempty"`
	Text     string    `json:"text,omitempty"`
	Segments []Segment `json:"segments,omitempty"`
}

// Segment is one entry of WhisperVerboseJSONResponse.Segments. It holds the
// segment's id, seek value, start/end positions, text, token ids and several
// floating-point statistics. No field uses omitempty, so all of them are
// always present in the encoded JSON (a nil Tokens slice encodes as null).
// NOTE(review): Start and End are presumably offsets in seconds; confirm
// against the upstream API.
type Segment struct {
	Id               int     `json:"id"`
	Seek             int     `json:"seek"`
	Start            float64 `json:"start"`
	End              float64 `json:"end"`
	Text             string  `json:"text"`
	Tokens           []int   `json:"tokens"`
	Temperature      float64 `json:"temperature"`
	AvgLogprob       float64 `json:"avg_logprob"`
	CompressionRatio float64 `json:"compression_ratio"`
	NoSpeechProb     float64 `json:"no_speech_prob"`
}
-												feat: 初步重构完成

											
										
										
											2024-02-29 16:21:25 +08:00
+								package dto
-												refactor: Introduce pre-consume quota and unify relay handlers

This commit introduces a major architectural refactoring to improve quota management, centralize logging, and streamline the relay handling logic.

Key changes:
- **Pre-consume Quota:** Implements a new mechanism to check and reserve user quota *before* making the request to the upstream provider. This ensures more accurate quota deduction and prevents users from exceeding their limits due to concurrent requests.

- **Unified Relay Handlers:** Refactors the relay logic to use generic handlers (e.g., `ChatHandler`, `ImageHandler`) instead of provider-specific implementations. This significantly reduces code duplication and simplifies adding new channels.

- **Centralized Logger:** A new dedicated `logger` package is introduced, and all system logging calls are migrated to use it, moving this responsibility out of the `common` package.

- **Code Reorganization:** DTOs are generalized (e.g., `dalle.go` -> `openai_image.go`) and utility code is moved to more appropriate packages (e.g., `common/http.go` -> `service/http.go`) for better code structure.

											
										
										
											2025-08-14 20:05:06 +08:00
+								import (
-												feat: AudioRequest add metadata support custom params

											
										
										
											2025-10-18 01:13:54 +08:00
+									"encoding/json"
-												feat(audio): enhance audio request handling with token type detection and streaming support

											
										
										
											2025-12-13 17:24:23 +08:00
+									"strings"
-												feat: AudioRequest add metadata support custom params

											
										
										
											2025-10-18 01:13:54 +08:00
-												format: package name -> github.com/QuantumNous/new-api (#2017)


											
										
										
											2025-10-11 15:30:09 +08:00
+									"github.com/QuantumNous/new-api/types"
-												refactor: Introduce pre-consume quota and unify relay handlers

This commit introduces a major architectural refactoring to improve quota management, centralize logging, and streamline the relay handling logic.

Key changes:
- **Pre-consume Quota:** Implements a new mechanism to check and reserve user quota *before* making the request to the upstream provider. This ensures more accurate quota deduction and prevents users from exceeding their limits due to concurrent requests.

- **Unified Relay Handlers:** Refactors the relay logic to use generic handlers (e.g., `ChatHandler`, `ImageHandler`) instead of provider-specific implementations. This significantly reduces code duplication and simplifies adding new channels.

- **Centralized Logger:** A new dedicated `logger` package is introduced, and all system logging calls are migrated to use it, moving this responsibility out of the `common` package.

- **Code Reorganization:** DTOs are generalized (e.g., `dalle.go` -> `openai_image.go`) and utility code is moved to more appropriate packages (e.g., `common/http.go` -> `service/http.go`) for better code structure.

											
										
										
											2025-08-14 20:05:06 +08:00
 									"github.com/gin-gonic/gin"
 								)
-												refactor: audio relay

											
										
										
											2024-07-16 22:07:10 +08:00
+								type AudioRequest struct {
-												feat: AudioRequest add metadata support custom params

											
										
										
											2025-10-18 01:13:54 +08:00
+									Model          string          `json:"model"`
 									Input          string          `json:"input"`
 									Voice          string          `json:"voice"`
 									Instructions   string          `json:"instructions,omitempty"`
 									ResponseFormat string          `json:"response_format,omitempty"`
-												fix: preserve explicit zero values in native relay requests

											
										
										
											2026-03-01 15:47:03 +08:00
+									Speed          *float64        `json:"speed,omitempty"`
-												feat: AudioRequest add metadata support custom params

											
										
										
											2025-10-18 01:13:54 +08:00
+									StreamFormat   string          `json:"stream_format,omitempty"`
 									Metadata       json.RawMessage `json:"metadata,omitempty"`
-												feat: fill in some custom fields for vllm-omini.

											
										
										
											2026-04-09 12:41:51 +08:00
+									// vllm-omini
 									TaskType                json.RawMessage `json:"task_type,omitempty"`
 									Language                json.RawMessage `json:"language,omitempty"`
 									RefAudio                json.RawMessage `json:"ref_audio,omitempty"`
 									RefText                 json.RawMessage `json:"ref_text,omitempty"`
 									XVectorOnlyMode         json.RawMessage `json:"x_vector_only_mode,omitempty"`
 									MaxNewTokens            json.RawMessage `json:"max_new_tokens,omitempty"`
 									InitialCodecChunkFrames json.RawMessage `json:"initial_codec_chunk_frames,omitempty"`
 									// TODO：ensure that the logic remains correct after the stream is started.
 									//Stream                  json.RawMessage `json:"stream,omitempty"`
-												feat: 初步重构完成

											
										
										
											2024-02-29 16:21:25 +08:00
+								}
-												refactor: Introduce pre-consume quota and unify relay handlers

This commit introduces a major architectural refactoring to improve quota management, centralize logging, and streamline the relay handling logic.

Key changes:
- **Pre-consume Quota:** Implements a new mechanism to check and reserve user quota *before* making the request to the upstream provider. This ensures more accurate quota deduction and prevents users from exceeding their limits due to concurrent requests.

- **Unified Relay Handlers:** Refactors the relay logic to use generic handlers (e.g., `ChatHandler`, `ImageHandler`) instead of provider-specific implementations. This significantly reduces code duplication and simplifies adding new channels.

- **Centralized Logger:** A new dedicated `logger` package is introduced, and all system logging calls are migrated to use it, moving this responsibility out of the `common` package.

- **Code Reorganization:** DTOs are generalized (e.g., `dalle.go` -> `openai_image.go`) and utility code is moved to more appropriate packages (e.g., `common/http.go` -> `service/http.go`) for better code structure.

											
										
										
											2025-08-14 20:05:06 +08:00
+								func (r *AudioRequest) GetTokenCountMeta() *types.TokenCountMeta {
 									meta := &types.TokenCountMeta{
 										CombineText: r.Input,
 										TokenType:   types.TokenTypeTextNumber,
 									}
-												feat(audio): enhance audio request handling with token type detection and streaming support

											
										
										
											2025-12-13 17:24:23 +08:00
+									if strings.Contains(r.Model, "gpt") {
 										meta.TokenType = types.TokenTypeTokenizer
 									}
-												refactor: Introduce pre-consume quota and unify relay handlers

This commit introduces a major architectural refactoring to improve quota management, centralize logging, and streamline the relay handling logic.

Key changes:
- **Pre-consume Quota:** Implements a new mechanism to check and reserve user quota *before* making the request to the upstream provider. This ensures more accurate quota deduction and prevents users from exceeding their limits due to concurrent requests.

- **Unified Relay Handlers:** Refactors the relay logic to use generic handlers (e.g., `ChatHandler`, `ImageHandler`) instead of provider-specific implementations. This significantly reduces code duplication and simplifies adding new channels.

- **Centralized Logger:** A new dedicated `logger` package is introduced, and all system logging calls are migrated to use it, moving this responsibility out of the `common` package.

- **Code Reorganization:** DTOs are generalized (e.g., `dalle.go` -> `openai_image.go`) and utility code is moved to more appropriate packages (e.g., `common/http.go` -> `service/http.go`) for better code structure.

											
										
										
											2025-08-14 20:05:06 +08:00
+									return meta
 								}
 								func (r *AudioRequest) IsStream(c *gin.Context) bool {
-												feat(audio): enhance audio request handling with token type detection and streaming support

											
										
										
											2025-12-13 17:24:23 +08:00
+									return r.StreamFormat == "sse"
-												refactor: Introduce pre-consume quota and unify relay handlers

This commit introduces a major architectural refactoring to improve quota management, centralize logging, and streamline the relay handling logic.

Key changes:
- **Pre-consume Quota:** Implements a new mechanism to check and reserve user quota *before* making the request to the upstream provider. This ensures more accurate quota deduction and prevents users from exceeding their limits due to concurrent requests.

- **Unified Relay Handlers:** Refactors the relay logic to use generic handlers (e.g., `ChatHandler`, `ImageHandler`) instead of provider-specific implementations. This significantly reduces code duplication and simplifies adding new channels.

- **Centralized Logger:** A new dedicated `logger` package is introduced, and all system logging calls are migrated to use it, moving this responsibility out of the `common` package.

- **Code Reorganization:** DTOs are generalized (e.g., `dalle.go` -> `openai_image.go`) and utility code is moved to more appropriate packages (e.g., `common/http.go` -> `service/http.go`) for better code structure.

											
										
										
											2025-08-14 20:05:06 +08:00
+								}
-												feat: 修复重试后请求结构混乱，修复rerank端点无法使用

											
										
										
											2025-08-23 13:12:15 +08:00
+								func (r *AudioRequest) SetModelName(modelName string) {
 									if modelName != "" {
 										r.Model = modelName
 									}
 								}
-												feat: 初步重构完成

											
										
										
											2024-02-29 16:21:25 +08:00
+								type AudioResponse struct {
 									Text string `json:"text"`
 								}
-												refactor: audio relay

											
										
										
											2024-07-16 22:07:10 +08:00
 								type WhisperVerboseJSONResponse struct {
 									Task     string    `json:"task,omitempty"`
 									Language string    `json:"language,omitempty"`
 									Duration float64   `json:"duration,omitempty"`
 									Text     string    `json:"text,omitempty"`
 									Segments []Segment `json:"segments,omitempty"`
 								}
 								type Segment struct {
 									Id               int     `json:"id"`
 									Seek             int     `json:"seek"`
 									Start            float64 `json:"start"`
 									End              float64 `json:"end"`
 									Text             string  `json:"text"`
 									Tokens           []int   `json:"tokens"`
 									Temperature      float64 `json:"temperature"`
 									AvgLogprob       float64 `json:"avg_logprob"`
 									CompressionRatio float64 `json:"compression_ratio"`
 									NoSpeechProb     float64 `json:"no_speech_prob"`
 								}