You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
memos/internal/ai/stt/stt.go

42 lines
1.3 KiB
Go

// Package stt defines the speech-to-text capability for AI providers.
// Implementations call dedicated STT endpoints (e.g. OpenAI /audio/transcriptions)
// and return deterministic transcription output. For multimodal LLMs that
// happen to accept audio input, see internal/ai/audiollm.
package stt
import (
"context"
"io"
)
// Transcriber is the speech-to-text capability: it turns audio into text by
// calling a provider's dedicated STT endpoint.
type Transcriber interface {
	// Transcribe runs one transcription call described by req, using ctx as
	// the call's context. It returns the transcription result or an error.
	Transcribe(ctx context.Context, req Request) (*Response, error)
}
// Request carries everything a single transcription call takes as input.
type Request struct {
	// Audio is the reader the audio is obtained from.
	Audio io.Reader
	// Size is presumably the byte length of Audio; confirm against callers.
	Size int64
	// Filename is the name associated with the audio.
	Filename string
	// ContentType is the IANA media type, e.g. "audio/wav".
	ContentType string
	// Model is the provider-specific model id
	// (e.g. "whisper-1", "gpt-4o-transcribe").
	Model string
	// Prompt is a soft spelling/vocabulary hint (Whisper "prompt" parameter).
	Prompt string
	// Language is an optional ISO 639-1 code.
	Language string
}
// Response carries what a single transcription call produces.
type Response struct {
	// Text is the transcribed text.
	Text string
	// Language is left empty when the provider did not return it.
	Language string
	// Segments is empty unless the provider returned timestamps.
	Segments []Segment
}
// Segment is one timestamped slice of the transcript.
type Segment struct {
	// Text is the transcript text for this portion.
	Text string
	// Start is the timestamp at which this portion begins
	// (presumably seconds; confirm against the provider mapping).
	Start float64
	// End is the timestamp at which this portion ends (same unit as Start).
	End float64
	// Speaker is empty unless a diarization-capable model is used.
	Speaker string
}