feat: Add support for counting cache-hit tokens in llama.cpp OpenAI-Compatible API

This commit is contained in:
wenyifan 2026-03-20 16:10:18 +08:00
parent ed6ff0f267
commit ff29900f30

View File

@ -627,6 +627,12 @@ func applyUsagePostProcessing(info *relaycommon.RelayInfo, usage *dto.Usage, res
usage.PromptTokensDetails.CachedTokens = usage.PromptCacheHitTokens
}
}
case constant.ChannelTypeOpenAI:
if usage.PromptTokensDetails.CachedTokens == 0 {
if cachedTokens, ok := extractLlamaCachedTokensFromBody(responseBody); ok {
usage.PromptTokensDetails.CachedTokens = cachedTokens
}
}
}
}
@ -689,3 +695,21 @@ func extractMoonshotCachedTokensFromBody(body []byte) (int, bool) {
return 0, false
}
// extractLlamaCachedTokensFromBody 从llama.cpp的非标准位置提取cache_n
func extractLlamaCachedTokensFromBody(body []byte) (int, bool) {
if len(body) == 0 {
return 0, false
}
var payload struct {
Usage struct {
CachedTokens *int `json:"cache_n"`
} `json:"timings"`
}
if err := common.Unmarshal(body, &payload); err != nil {
return 0, false
}
return *payload.Usage.CachedTokens, true
}