feat(ai): add BYOK audio transcription (#5832)

2 months ago · 101704c8ea
parent 0ad0fec8d4
commit 101704c8ea
29 changed files with 564 additions and 520 deletions
--- a/internal/ai/ai.go
+++ b/internal/ai/ai.go
@ -6,12 +6,8 @@ type ProviderType string
 const (
 	// ProviderOpenAI is OpenAI's hosted API.
 	ProviderOpenAI ProviderType = "OPENAI"
-	// ProviderOpenAICompatible is an OpenAI-compatible API endpoint.
-	ProviderOpenAICompatible ProviderType = "OPENAI_COMPATIBLE"
 	// ProviderGemini is Google's Gemini API.
 	ProviderGemini ProviderType = "GEMINI"
-	// ProviderAnthropic is Anthropic's API.
-	ProviderAnthropic ProviderType = "ANTHROPIC"
 )

 // ProviderConfig configures a callable AI provider connection.
@ -21,6 +17,4 @@ type ProviderConfig struct {
 	Type     ProviderType
 	Endpoint string
 	APIKey   string
-	Models       []string
-	DefaultModel string
 }
--- a/internal/ai/models.go
+++ b/internal/ai/models.go
@ -0,0 +1,22 @@
+package ai
+
+import "github.com/pkg/errors"
+
+const (
+	// DefaultOpenAITranscriptionModel is the built-in OpenAI transcription model.
+	DefaultOpenAITranscriptionModel = "gpt-4o-transcribe"
+	// DefaultGeminiTranscriptionModel is the built-in Gemini transcription model.
+	DefaultGeminiTranscriptionModel = "gemini-2.5-flash"
+)
+
+// DefaultTranscriptionModel returns the built-in transcription model for providerType, or an error wrapping ErrCapabilityUnsupported for any type other than ProviderOpenAI or ProviderGemini.
+func DefaultTranscriptionModel(providerType ProviderType) (string, error) {
+	switch providerType {
+	case ProviderOpenAI:
+		return DefaultOpenAITranscriptionModel, nil
+	case ProviderGemini:
+		return DefaultGeminiTranscriptionModel, nil
+	default:
+		return "", errors.Wrapf(ErrCapabilityUnsupported, "provider type %q", providerType)
+	}
+}
--- a/proto/api/v1/ai_service.proto
+++ b/proto/api/v1/ai_service.proto
@ -31,14 +31,11 @@ message TranscribeRequest {
 }

 message TranscriptionConfig {
-  // Optional. The model to use. If empty, the provider's default model is used.
-  string model = 1 [(google.api.field_behavior) = OPTIONAL];
-
  // Optional. A prompt to improve transcription quality.
-  string prompt = 2 [(google.api.field_behavior) = OPTIONAL];
+  string prompt = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The language of the input audio.
-  string language = 3 [(google.api.field_behavior) = OPTIONAL];
+  string language = 2 [(google.api.field_behavior) = OPTIONAL];
 }

 message TranscriptionAudio {
--- a/proto/api/v1/instance_service.proto
+++ b/proto/api/v1/instance_service.proto
@ -219,8 +219,6 @@ message InstanceSetting {
    string endpoint = 4;
    // api_key is write-only and is never returned by GetInstanceSetting.
    string api_key = 5 [(google.api.field_behavior) = INPUT_ONLY];
-    repeated string models = 6;
-    string default_model = 7;
    // api_key_set indicates whether an API key is stored for this provider.
    bool api_key_set = 8 [(google.api.field_behavior) = OUTPUT_ONLY];
    // api_key_hint is a masked hint for the stored API key.
@ -231,9 +229,7 @@ message InstanceSetting {
  enum AIProviderType {
    AI_PROVIDER_TYPE_UNSPECIFIED = 0;
    OPENAI = 1;
-    OPENAI_COMPATIBLE = 2;
-    GEMINI = 3;
-    ANTHROPIC = 4;
+    GEMINI = 2;
  }
 }

--- a/proto/gen/api/v1/ai_service.pb.go
+++ b/proto/gen/api/v1/ai_service.pb.go
@ -87,12 +87,10 @@ func (x *TranscribeRequest) GetAudio() *TranscriptionAudio {

 type TranscriptionConfig struct {
 	state protoimpl.MessageState `protogen:"open.v1"`
-	// Optional. The model to use. If empty, the provider's default model is used.
-	Model string `protobuf:"bytes,1,opt,name=model,proto3" json:"model,omitempty"`
 	// Optional. A prompt to improve transcription quality.
-	Prompt string `protobuf:"bytes,2,opt,name=prompt,proto3" json:"prompt,omitempty"`
+	Prompt string `protobuf:"bytes,1,opt,name=prompt,proto3" json:"prompt,omitempty"`
 	// Optional. The language of the input audio.
-	Language      string `protobuf:"bytes,3,opt,name=language,proto3" json:"language,omitempty"`
+	Language      string `protobuf:"bytes,2,opt,name=language,proto3" json:"language,omitempty"`
 	unknownFields protoimpl.UnknownFields
 	sizeCache     protoimpl.SizeCache
 }
@ -127,13 +125,6 @@ func (*TranscriptionConfig) Descriptor() ([]byte, []int) {
 	return file_api_v1_ai_service_proto_rawDescGZIP(), []int{1}
 }

-func (x *TranscriptionConfig) GetModel() string {
-	if x != nil {
-		return x.Model
-	}
-	return ""
-}
-
 func (x *TranscriptionConfig) GetPrompt() string {
 	if x != nil {
 		return x.Prompt
@ -304,11 +295,10 @@ const file_api_v1_ai_service_proto_rawDesc = "" +
 	"\vprovider_id\x18\x01 \x01(\tB\x03\xe0A\x02R\n" +
 	"providerId\x12>\n" +
 	"\x06config\x18\x02 \x01(\v2!.memos.api.v1.TranscriptionConfigB\x03\xe0A\x02R\x06config\x12;\n" +
-	"\x05audio\x18\x03 \x01(\v2 .memos.api.v1.TranscriptionAudioB\x03\xe0A\x02R\x05audio\"n\n" +
-	"\x13TranscriptionConfig\x12\x19\n" +
-	"\x05model\x18\x01 \x01(\tB\x03\xe0A\x01R\x05model\x12\x1b\n" +
-	"\x06prompt\x18\x02 \x01(\tB\x03\xe0A\x01R\x06prompt\x12\x1f\n" +
-	"\blanguage\x18\x03 \x01(\tB\x03\xe0A\x01R\blanguage\"\x9c\x01\n" +
+	"\x05audio\x18\x03 \x01(\v2 .memos.api.v1.TranscriptionAudioB\x03\xe0A\x02R\x05audio\"S\n" +
+	"\x13TranscriptionConfig\x12\x1b\n" +
+	"\x06prompt\x18\x01 \x01(\tB\x03\xe0A\x01R\x06prompt\x12\x1f\n" +
+	"\blanguage\x18\x02 \x01(\tB\x03\xe0A\x01R\blanguage\"\x9c\x01\n" +
 	"\x12TranscriptionAudio\x12\x1f\n" +
 	"\acontent\x18\x01 \x01(\fB\x03\xe0A\x04H\x00R\acontent\x12\x12\n" +
 	"\x03uri\x18\x02 \x01(\tH\x00R\x03uri\x12\x1f\n" +
--- a/proto/gen/api/v1/instance_service.pb.go
+++ b/proto/gen/api/v1/instance_service.pb.go
@ -98,9 +98,7 @@ type InstanceSetting_AIProviderType int32
 const (
 	InstanceSetting_AI_PROVIDER_TYPE_UNSPECIFIED InstanceSetting_AIProviderType = 0
 	InstanceSetting_OPENAI                       InstanceSetting_AIProviderType = 1
-	InstanceSetting_OPENAI_COMPATIBLE            InstanceSetting_AIProviderType = 2
-	InstanceSetting_GEMINI                       InstanceSetting_AIProviderType = 3
-	InstanceSetting_ANTHROPIC                    InstanceSetting_AIProviderType = 4
+	InstanceSetting_GEMINI                       InstanceSetting_AIProviderType = 2
 )

 // Enum value maps for InstanceSetting_AIProviderType.
@ -108,16 +106,12 @@ var (
 	InstanceSetting_AIProviderType_name = map[int32]string{
 		0: "AI_PROVIDER_TYPE_UNSPECIFIED",
 		1: "OPENAI",
-		2: "OPENAI_COMPATIBLE",
-		3: "GEMINI",
-		4: "ANTHROPIC",
+		2: "GEMINI",
 	}
 	InstanceSetting_AIProviderType_value = map[string]int32{
 		"AI_PROVIDER_TYPE_UNSPECIFIED": 0,
 		"OPENAI":                       1,
-		"OPENAI_COMPATIBLE":            2,
-		"GEMINI":                       3,
-		"ANTHROPIC":                    4,
+		"GEMINI":                       2,
 	}
 )

@ -1037,8 +1031,6 @@ type InstanceSetting_AIProviderConfig struct {
 	Endpoint string                         `protobuf:"bytes,4,opt,name=endpoint,proto3" json:"endpoint,omitempty"`
 	// api_key is write-only and is never returned by GetInstanceSetting.
 	ApiKey string `protobuf:"bytes,5,opt,name=api_key,json=apiKey,proto3" json:"api_key,omitempty"`
-	Models       []string `protobuf:"bytes,6,rep,name=models,proto3" json:"models,omitempty"`
-	DefaultModel string   `protobuf:"bytes,7,opt,name=default_model,json=defaultModel,proto3" json:"default_model,omitempty"`
 	// api_key_set indicates whether an API key is stored for this provider.
 	ApiKeySet bool `protobuf:"varint,8,opt,name=api_key_set,json=apiKeySet,proto3" json:"api_key_set,omitempty"`
 	// api_key_hint is a masked hint for the stored API key.
@ -1112,20 +1104,6 @@ func (x *InstanceSetting_AIProviderConfig) GetApiKey() string {
 	return ""
 }

-func (x *InstanceSetting_AIProviderConfig) GetModels() []string {
-	if x != nil {
-		return x.Models
-	}
-	return nil
-}
-
-func (x *InstanceSetting_AIProviderConfig) GetDefaultModel() string {
-	if x != nil {
-		return x.DefaultModel
-	}
-	return ""
-}
-
 func (x *InstanceSetting_AIProviderConfig) GetApiKeySet() bool {
 	if x != nil {
 		return x.ApiKeySet
@ -1414,7 +1392,7 @@ const file_api_v1_instance_service_proto_rawDesc = "" +
 	"\x04demo\x18\x03 \x01(\bR\x04demo\x12!\n" +
 	"\finstance_url\x18\x06 \x01(\tR\vinstanceUrl\x12(\n" +
 	"\x05admin\x18\a \x01(\v2\x12.memos.api.v1.UserR\x05admin\"\x1b\n" +
-	"\x19GetInstanceProfileRequest\"\xe2\x1a\n" +
+	"\x19GetInstanceProfileRequest\"\xff\x19\n" +
 	"\x0fInstanceSetting\x12\x17\n" +
 	"\x04name\x18\x01 \x01(\tB\x03\xe0A\bR\x04name\x12W\n" +
 	"\x0fgeneral_setting\x18\x02 \x01(\v2,.memos.api.v1.InstanceSetting.GeneralSettingH\x00R\x0egeneralSetting\x12W\n" +
@ -1483,15 +1461,13 @@ const file_api_v1_instance_service_proto_rawDesc = "" +
 	"\ause_ssl\x18\n" +
 	" \x01(\bR\x06useSsl\x1aY\n" +
 	"\tAISetting\x12L\n" +
-	"\tproviders\x18\x01 \x03(\v2..memos.api.v1.InstanceSetting.AIProviderConfigR\tproviders\x1a\xbd\x02\n" +
+	"\tproviders\x18\x01 \x03(\v2..memos.api.v1.InstanceSetting.AIProviderConfigR\tproviders\x1a\x80\x02\n" +
 	"\x10AIProviderConfig\x12\x0e\n" +
 	"\x02id\x18\x01 \x01(\tR\x02id\x12\x14\n" +
 	"\x05title\x18\x02 \x01(\tR\x05title\x12@\n" +
 	"\x04type\x18\x03 \x01(\x0e2,.memos.api.v1.InstanceSetting.AIProviderTypeR\x04type\x12\x1a\n" +
 	"\bendpoint\x18\x04 \x01(\tR\bendpoint\x12\x1c\n" +
-	"\aapi_key\x18\x05 \x01(\tB\x03\xe0A\x04R\x06apiKey\x12\x16\n" +
-	"\x06models\x18\x06 \x03(\tR\x06models\x12#\n" +
-	"\rdefault_model\x18\a \x01(\tR\fdefaultModel\x12#\n" +
+	"\aapi_key\x18\x05 \x01(\tB\x03\xe0A\x04R\x06apiKey\x12#\n" +
 	"\vapi_key_set\x18\b \x01(\bB\x03\xe0A\x03R\tapiKeySet\x12%\n" +
 	"\fapi_key_hint\x18\t \x01(\tB\x03\xe0A\x03R\n" +
 	"apiKeyHint\"j\n" +
@ -1502,15 +1478,13 @@ const file_api_v1_instance_service_proto_rawDesc = "" +
 	"\fMEMO_RELATED\x10\x03\x12\b\n" +
 	"\x04TAGS\x10\x04\x12\x10\n" +
 	"\fNOTIFICATION\x10\x05\x12\x06\n" +
-	"\x02AI\x10\x06\"p\n" +
+	"\x02AI\x10\x06\"J\n" +
 	"\x0eAIProviderType\x12 \n" +
 	"\x1cAI_PROVIDER_TYPE_UNSPECIFIED\x10\x00\x12\n" +
 	"\n" +
-	"\x06OPENAI\x10\x01\x12\x15\n" +
-	"\x11OPENAI_COMPATIBLE\x10\x02\x12\n" +
+	"\x06OPENAI\x10\x01\x12\n" +
 	"\n" +
-	"\x06GEMINI\x10\x03\x12\r\n" +
-	"\tANTHROPIC\x10\x04:a\xeaA^\n" +
+	"\x06GEMINI\x10\x02:a\xeaA^\n" +
 	"\x1cmemos.api.v1/InstanceSetting\x12\x1binstance/settings/{setting}*\x10instanceSettings2\x0finstanceSettingB\a\n" +
 	"\x05value\"U\n" +
 	"\x19GetInstanceSettingRequest\x128\n" +
--- a/proto/gen/openapi.yaml
+++ b/proto/gen/openapi.yaml
@ -2419,9 +2419,7 @@ components:
                    enum:
                        - AI_PROVIDER_TYPE_UNSPECIFIED
                        - OPENAI
-                        - OPENAI_COMPATIBLE
                        - GEMINI
-                        - ANTHROPIC
                    type: string
                    format: enum
                endpoint:
@ -2430,12 +2428,6 @@ components:
                    writeOnly: true
                    type: string
                    description: api_key is write-only and is never returned by GetInstanceSetting.
-                models:
-                    type: array
-                    items:
-                        type: string
-                defaultModel:
-                    type: string
                apiKeySet:
                    readOnly: true
                    type: boolean
@ -3261,9 +3253,6 @@ components:
        TranscriptionConfig:
            type: object
            properties:
-                model:
-                    type: string
-                    description: Optional. The model to use. If empty, the provider's default model is used.
                prompt:
                    type: string
                    description: Optional. A prompt to improve transcription quality.
--- a/proto/gen/store/instance_setting.pb.go
+++ b/proto/gen/store/instance_setting.pb.go
@ -98,9 +98,7 @@ type AIProviderType int32
 const (
 	AIProviderType_AI_PROVIDER_TYPE_UNSPECIFIED AIProviderType = 0
 	AIProviderType_OPENAI                       AIProviderType = 1
-	AIProviderType_OPENAI_COMPATIBLE            AIProviderType = 2
-	AIProviderType_GEMINI                       AIProviderType = 3
-	AIProviderType_ANTHROPIC                    AIProviderType = 4
+	AIProviderType_GEMINI                       AIProviderType = 2
 )

 // Enum value maps for AIProviderType.
@ -108,16 +106,12 @@ var (
 	AIProviderType_name = map[int32]string{
 		0: "AI_PROVIDER_TYPE_UNSPECIFIED",
 		1: "OPENAI",
-		2: "OPENAI_COMPATIBLE",
-		3: "GEMINI",
-		4: "ANTHROPIC",
+		2: "GEMINI",
 	}
 	AIProviderType_value = map[string]int32{
 		"AI_PROVIDER_TYPE_UNSPECIFIED": 0,
 		"OPENAI":                       1,
-		"OPENAI_COMPATIBLE":            2,
-		"GEMINI":                       3,
-		"ANTHROPIC":                    4,
+		"GEMINI":                       2,
 	}
 )

@ -1027,8 +1021,6 @@ type AIProviderConfig struct {
 	Endpoint string                 `protobuf:"bytes,4,opt,name=endpoint,proto3" json:"endpoint,omitempty"`
 	// api_key is write-only at the API layer and is required by the server to call providers.
 	ApiKey        string `protobuf:"bytes,5,opt,name=api_key,json=apiKey,proto3" json:"api_key,omitempty"`
-	Models        []string `protobuf:"bytes,6,rep,name=models,proto3" json:"models,omitempty"`
-	DefaultModel  string   `protobuf:"bytes,7,opt,name=default_model,json=defaultModel,proto3" json:"default_model,omitempty"`
 	unknownFields protoimpl.UnknownFields
 	sizeCache     protoimpl.SizeCache
 }
@ -1098,20 +1090,6 @@ func (x *AIProviderConfig) GetApiKey() string {
 	return ""
 }

-func (x *AIProviderConfig) GetModels() []string {
-	if x != nil {
-		return x.Models
-	}
-	return nil
-}
-
-func (x *AIProviderConfig) GetDefaultModel() string {
-	if x != nil {
-		return x.DefaultModel
-	}
-	return ""
-}
-
 type InstanceNotificationSetting_EmailSetting struct {
 	state         protoimpl.MessageState `protogen:"open.v1"`
 	Enabled       bool                   `protobuf:"varint,1,opt,name=enabled,proto3" json:"enabled,omitempty"`
@ -1307,15 +1285,13 @@ const file_store_instance_setting_proto_rawDesc = "" +
 	"\ause_ssl\x18\n" +
 	" \x01(\bR\x06useSsl\"P\n" +
 	"\x11InstanceAISetting\x12;\n" +
-	"\tproviders\x18\x01 \x03(\v2\x1d.memos.store.AIProviderConfigR\tproviders\"\xdb\x01\n" +
+	"\tproviders\x18\x01 \x03(\v2\x1d.memos.store.AIProviderConfigR\tproviders\"\x9e\x01\n" +
 	"\x10AIProviderConfig\x12\x0e\n" +
 	"\x02id\x18\x01 \x01(\tR\x02id\x12\x14\n" +
 	"\x05title\x18\x02 \x01(\tR\x05title\x12/\n" +
 	"\x04type\x18\x03 \x01(\x0e2\x1b.memos.store.AIProviderTypeR\x04type\x12\x1a\n" +
 	"\bendpoint\x18\x04 \x01(\tR\bendpoint\x12\x17\n" +
-	"\aapi_key\x18\x05 \x01(\tR\x06apiKey\x12\x16\n" +
-	"\x06models\x18\x06 \x03(\tR\x06models\x12#\n" +
-	"\rdefault_model\x18\a \x01(\tR\fdefaultModel*\x95\x01\n" +
+	"\aapi_key\x18\x05 \x01(\tR\x06apiKey*\x95\x01\n" +
 	"\x12InstanceSettingKey\x12$\n" +
 	" INSTANCE_SETTING_KEY_UNSPECIFIED\x10\x00\x12\t\n" +
 	"\x05BASIC\x10\x01\x12\v\n" +
@ -1324,15 +1300,13 @@ const file_store_instance_setting_proto_rawDesc = "" +
 	"\fMEMO_RELATED\x10\x04\x12\b\n" +
 	"\x04TAGS\x10\x05\x12\x10\n" +
 	"\fNOTIFICATION\x10\x06\x12\x06\n" +
-	"\x02AI\x10\a*p\n" +
+	"\x02AI\x10\a*J\n" +
 	"\x0eAIProviderType\x12 \n" +
 	"\x1cAI_PROVIDER_TYPE_UNSPECIFIED\x10\x00\x12\n" +
 	"\n" +
-	"\x06OPENAI\x10\x01\x12\x15\n" +
-	"\x11OPENAI_COMPATIBLE\x10\x02\x12\n" +
+	"\x06OPENAI\x10\x01\x12\n" +
 	"\n" +
-	"\x06GEMINI\x10\x03\x12\r\n" +
-	"\tANTHROPIC\x10\x04B\x9f\x01\n" +
+	"\x06GEMINI\x10\x02B\x9f\x01\n" +
 	"\x0fcom.memos.storeB\x14InstanceSettingProtoP\x01Z)github.com/usememos/memos/proto/gen/store\xa2\x02\x03MSX\xaa\x02\vMemos.Store\xca\x02\vMemos\\Store\xe2\x02\x17Memos\\Store\\GPBMetadata\xea\x02\fMemos::Storeb\x06proto3"

 var (
--- a/proto/store/instance_setting.proto
+++ b/proto/store/instance_setting.proto
@ -158,14 +158,10 @@ message AIProviderConfig {
  string endpoint = 4;
  // api_key is write-only at the API layer and is required by the server to call providers.
  string api_key = 5;
-  repeated string models = 6;
-  string default_model = 7;
 }

 enum AIProviderType {
  AI_PROVIDER_TYPE_UNSPECIFIED = 0;
  OPENAI = 1;
-  OPENAI_COMPATIBLE = 2;
-  GEMINI = 3;
-  ANTHROPIC = 4;
+  GEMINI = 2;
 }
--- a/server/router/api/v1/ai_service.go
+++ b/server/router/api/v1/ai_service.go
@ -93,7 +93,7 @@ func (s *APIV1Service) Transcribe(ctx context.Context, request *v1pb.TranscribeR
 		return nil, status.Errorf(codes.InvalidArgument, "audio content type %q is not supported", contentType)
 	}

-	provider, model, err := s.resolveAIProviderForTranscription(ctx, request.ProviderId, request.Config.GetModel())
+	provider, model, err := s.resolveAIProviderForTranscription(ctx, request.ProviderId)
 	if err != nil {
 		return nil, err
 	}
@ -119,7 +119,7 @@ func (s *APIV1Service) Transcribe(ctx context.Context, request *v1pb.TranscribeR
 	}, nil
 }

-func (s *APIV1Service) resolveAIProviderForTranscription(ctx context.Context, providerID string, model string) (ai.ProviderConfig, string, error) {
+func (s *APIV1Service) resolveAIProviderForTranscription(ctx context.Context, providerID string) (ai.ProviderConfig, string, error) {
 	setting, err := s.Store.GetInstanceAISetting(ctx)
 	if err != nil {
 		return ai.ProviderConfig{}, "", status.Errorf(codes.Internal, "failed to get AI setting: %v", err)
@ -137,15 +137,9 @@ func (s *APIV1Service) resolveAIProviderForTranscription(ctx context.Context, pr
 	if err != nil {
 		return ai.ProviderConfig{}, "", status.Errorf(codes.NotFound, "AI provider not found")
 	}
-	selectedModel := strings.TrimSpace(model)
-	if selectedModel == "" {
-		selectedModel = provider.DefaultModel
-	}
-	if selectedModel == "" {
-		return ai.ProviderConfig{}, "", status.Errorf(codes.InvalidArgument, "model is required")
-	}
-	if !containsString(provider.Models, selectedModel) {
-		return ai.ProviderConfig{}, "", status.Errorf(codes.InvalidArgument, "model %q is not configured for provider %q", selectedModel, provider.ID)
+	selectedModel, err := ai.DefaultTranscriptionModel(provider.Type)
+	if err != nil {
+		return ai.ProviderConfig{}, "", status.Errorf(codes.InvalidArgument, "%v", err)
 	}
 	return *provider, selectedModel, nil
 }
@ -157,8 +151,6 @@ func convertAIProviderConfigFromStore(provider *storepb.AIProviderConfig) ai.Pro
 		Type:     convertAIProviderTypeFromStore(provider.GetType()),
 		Endpoint: provider.GetEndpoint(),
 		APIKey:   provider.GetApiKey(),
-		Models:       provider.GetModels(),
-		DefaultModel: provider.GetDefaultModel(),
 	}
 }

@ -166,12 +158,8 @@ func convertAIProviderTypeFromStore(providerType storepb.AIProviderType) ai.Prov
 	switch providerType {
 	case storepb.AIProviderType_OPENAI:
 		return ai.ProviderOpenAI
-	case storepb.AIProviderType_OPENAI_COMPATIBLE:
-		return ai.ProviderOpenAICompatible
 	case storepb.AIProviderType_GEMINI:
 		return ai.ProviderGemini
-	case storepb.AIProviderType_ANTHROPIC:
-		return ai.ProviderAnthropic
 	default:
 		return ""
 	}
@ -179,7 +167,7 @@ func convertAIProviderTypeFromStore(providerType storepb.AIProviderType) ai.Prov

 func newAITranscriber(provider ai.ProviderConfig) (ai.Transcriber, error) {
 	switch provider.Type {
-	case ai.ProviderOpenAI, ai.ProviderOpenAICompatible:
+	case ai.ProviderOpenAI:
 		return openai.NewTranscriber(provider)
 	case ai.ProviderGemini:
 		return gemini.NewTranscriber(provider)
@ -188,15 +176,6 @@ func newAITranscriber(provider ai.ProviderConfig) (ai.Transcriber, error) {
 	}
 }

-func containsString(values []string, target string) bool {
-	for _, value := range values {
-		if value == target {
-			return true
-		}
-	}
-	return false
-}
-
 func isSupportedTranscriptionContentType(contentType string) bool {
 	mediaType, _, err := mime.ParseMediaType(strings.TrimSpace(contentType))
 	if err != nil {
--- a/server/router/api/v1/instance_service.go
+++ b/server/router/api/v1/instance_service.go
@ -5,7 +5,6 @@ import (
 	"fmt"
 	"math"
 	"regexp"
-	"slices"
 	"strings"

 	"github.com/lithammer/shortuuid/v4"
@ -75,10 +74,9 @@ func (s *APIV1Service) GetInstanceSetting(ctx context.Context, request *v1pb.Get
 		return nil, status.Errorf(codes.NotFound, "instance setting not found")
 	}

-	// Storage, notification, and AI settings contain credentials; restrict to admins only.
+	// Storage and notification settings contain credentials; restrict to admins only.
 	if instanceSetting.Key == storepb.InstanceSettingKey_STORAGE ||
-		instanceSetting.Key == storepb.InstanceSettingKey_NOTIFICATION ||
-		instanceSetting.Key == storepb.InstanceSettingKey_AI {
+		instanceSetting.Key == storepb.InstanceSettingKey_NOTIFICATION {
 		user, err := s.fetchCurrentUser(ctx)
 		if err != nil {
 			return nil, status.Errorf(codes.Internal, "failed to get current user: %v", err)
@ -90,6 +88,15 @@ func (s *APIV1Service) GetInstanceSetting(ctx context.Context, request *v1pb.Get
 			return nil, status.Errorf(codes.PermissionDenied, "permission denied")
 		}
 	}
+	if instanceSetting.Key == storepb.InstanceSettingKey_AI {
+		user, err := s.fetchCurrentUser(ctx)
+		if err != nil {
+			return nil, status.Errorf(codes.Internal, "failed to get current user: %v", err)
+		}
+		if user == nil {
+			return nil, status.Errorf(codes.Unauthenticated, "user not authenticated")
+		}
+	}

 	return convertInstanceSettingFromStore(instanceSetting), nil
 }
@ -433,8 +440,6 @@ func convertInstanceAISettingFromStore(setting *storepb.InstanceAISetting) *v1pb
 			Title:      provider.GetTitle(),
 			Type:       v1pb.InstanceSetting_AIProviderType(provider.GetType()),
 			Endpoint:   provider.GetEndpoint(),
-			Models:       provider.GetModels(),
-			DefaultModel: provider.GetDefaultModel(),
 			ApiKeySet:  apiKey != "",
 			ApiKeyHint: maskAPIKey(apiKey),
 		})
@ -460,8 +465,6 @@ func convertInstanceAISettingToStore(setting *v1pb.InstanceSetting_AISetting) *s
 			Type:     storepb.AIProviderType(provider.GetType()),
 			Endpoint: provider.GetEndpoint(),
 			ApiKey:   provider.GetApiKey(),
-			Models:       provider.GetModels(),
-			DefaultModel: provider.GetDefaultModel(),
 		})
 	}
 	return aiSetting
@ -515,31 +518,16 @@ func (s *APIV1Service) prepareInstanceAISettingForUpdate(ctx context.Context, se
 		if provider.Title == "" {
 			return errors.New("provider title is required")
 		}
-		if provider.Type == storepb.AIProviderType_AI_PROVIDER_TYPE_UNSPECIFIED {
-			return errors.Errorf("provider %q type is required", provider.Id)
+		if provider.Type != storepb.AIProviderType_OPENAI && provider.Type != storepb.AIProviderType_GEMINI {
+			return errors.Errorf("provider %q has unsupported type", provider.Id)
 		}

 		provider.Endpoint = strings.TrimSpace(provider.Endpoint)
 		if provider.Type == storepb.AIProviderType_OPENAI && provider.Endpoint == "" {
 			provider.Endpoint = "https://api.openai.com/v1"
 		}
-		if provider.Type == storepb.AIProviderType_ANTHROPIC && provider.Endpoint == "" {
-			provider.Endpoint = "https://api.anthropic.com/v1"
-		}
-		if provider.Type == storepb.AIProviderType_OPENAI_COMPATIBLE && provider.Endpoint == "" {
-			return errors.Errorf("provider %q endpoint is required", provider.Id)
-		}
-
-		provider.Models = normalizeAIModels(provider.Models)
-		if len(provider.Models) == 0 {
-			return errors.Errorf("provider %q must define at least one model", provider.Id)
-		}
-		provider.DefaultModel = strings.TrimSpace(provider.DefaultModel)
-		if provider.DefaultModel == "" {
-			provider.DefaultModel = provider.Models[0]
-		}
-		if !slices.Contains(provider.Models, provider.DefaultModel) {
-			return errors.Errorf("provider %q default model %q must be included in models", provider.Id, provider.DefaultModel)
+		if provider.Type == storepb.AIProviderType_GEMINI && provider.Endpoint == "" {
+			provider.Endpoint = "https://generativelanguage.googleapis.com/v1beta"
 		}

 		if provider.ApiKey == "" {
@ -554,20 +542,6 @@ func (s *APIV1Service) prepareInstanceAISettingForUpdate(ctx context.Context, se
 	return nil
 }

-func normalizeAIModels(models []string) []string {
-	normalized := []string{}
-	seen := map[string]bool{}
-	for _, model := range models {
-		model = strings.TrimSpace(model)
-		if model == "" || seen[model] {
-			continue
-		}
-		seen[model] = true
-		normalized = append(normalized, model)
-	}
-	return normalized
-}
-
 func maskAPIKey(apiKey string) string {
 	if apiKey == "" {
 		return ""
--- a/server/router/api/v1/test/ai_service_test.go
+++ b/server/router/api/v1/test/ai_service_test.go
@ -22,9 +22,7 @@ func TestTranscribe(t *testing.T) {

 		_, err := ts.Service.Transcribe(ctx, &v1pb.TranscribeRequest{
 			ProviderId: "openai-main",
-			Config: &v1pb.TranscriptionConfig{
-				Model: "gpt-4o-transcribe",
-			},
+			Config:     &v1pb.TranscriptionConfig{},
 			Audio: &v1pb.TranscriptionAudio{
 				Source:      &v1pb.TranscriptionAudio_Content{Content: []byte("RIFF")},
 				Filename:    "voice.wav",
@ -70,11 +68,9 @@ func TestTranscribe(t *testing.T) {
 						{
 							Id:       "openai-main",
 							Title:    "OpenAI",
-							Type:         storepb.AIProviderType_OPENAI_COMPATIBLE,
+							Type:     storepb.AIProviderType_OPENAI,
 							Endpoint: openAIServer.URL,
 							ApiKey:   "sk-test",
-							Models:       []string{"gpt-4o-transcribe"},
-							DefaultModel: "gpt-4o-transcribe",
 						},
 					},
 				},
@ -97,6 +93,48 @@ func TestTranscribe(t *testing.T) {
 		require.Equal(t, "transcribed text", resp.Text)
 	})

+	t.Run("returns provider error without rewriting it", func(t *testing.T) {
+		ts := NewTestService(t)
+		defer ts.Cleanup()
+
+		user, err := ts.CreateRegularUser(ctx, "notfound-user")
+		require.NoError(t, err)
+		userCtx := ts.CreateUserContext(ctx, user.ID)
+
+		openAIServer := httptest.NewServer(http.NotFoundHandler())
+		defer openAIServer.Close()
+
+		_, err = ts.Store.UpsertInstanceSetting(ctx, &storepb.InstanceSetting{
+			Key: storepb.InstanceSettingKey_AI,
+			Value: &storepb.InstanceSetting_AiSetting{
+				AiSetting: &storepb.InstanceAISetting{
+					Providers: []*storepb.AIProviderConfig{
+						{
+							Id:       "openai-main",
+							Title:    "OpenAI",
+							Type:     storepb.AIProviderType_OPENAI,
+							Endpoint: openAIServer.URL,
+							ApiKey:   "sk-test",
+						},
+					},
+				},
+			},
+		})
+		require.NoError(t, err)
+
+		_, err = ts.Service.Transcribe(userCtx, &v1pb.TranscribeRequest{
+			ProviderId: "openai-main",
+			Config:     &v1pb.TranscriptionConfig{},
+			Audio: &v1pb.TranscriptionAudio{
+				Source:      &v1pb.TranscriptionAudio_Content{Content: []byte("RIFF")},
+				Filename:    "voice.wav",
+				ContentType: "audio/wav",
+			},
+		})
+		require.Error(t, err)
+		require.Contains(t, err.Error(), "failed to transcribe audio")
+	})
+
 	t.Run("transcribes audio file with Gemini provider", func(t *testing.T) {
 		ts := NewTestService(t)
 		defer ts.Cleanup()
@ -132,8 +170,6 @@ func TestTranscribe(t *testing.T) {
 							Type:     storepb.AIProviderType_GEMINI,
 							Endpoint: geminiServer.URL + "/v1beta",
 							ApiKey:   "gemini-key",
-							Models:       []string{"gemini-2.5-flash"},
-							DefaultModel: "gemini-2.5-flash",
 						},
 					},
 				},
@ -154,48 +190,7 @@ func TestTranscribe(t *testing.T) {
 		require.Equal(t, "gemini transcript", resp.Text)
 	})

-	t.Run("rejects Anthropic transcription as unsupported", func(t *testing.T) {
-		ts := NewTestService(t)
-		defer ts.Cleanup()
-
-		user, err := ts.CreateRegularUser(ctx, "anthropic-user")
-		require.NoError(t, err)
-		userCtx := ts.CreateUserContext(ctx, user.ID)
-
-		_, err = ts.Store.UpsertInstanceSetting(ctx, &storepb.InstanceSetting{
-			Key: storepb.InstanceSettingKey_AI,
-			Value: &storepb.InstanceSetting_AiSetting{
-				AiSetting: &storepb.InstanceAISetting{
-					Providers: []*storepb.AIProviderConfig{
-						{
-							Id:           "anthropic-main",
-							Title:        "Anthropic",
-							Type:         storepb.AIProviderType_ANTHROPIC,
-							Endpoint:     "https://api.anthropic.com/v1",
-							ApiKey:       "sk-ant-test",
-							Models:       []string{"claude-sonnet-4-5"},
-							DefaultModel: "claude-sonnet-4-5",
-						},
-					},
-				},
-			},
-		})
-		require.NoError(t, err)
-
-		_, err = ts.Service.Transcribe(userCtx, &v1pb.TranscribeRequest{
-			ProviderId: "anthropic-main",
-			Config:     &v1pb.TranscriptionConfig{},
-			Audio: &v1pb.TranscriptionAudio{
-				Source:      &v1pb.TranscriptionAudio_Content{Content: []byte("RIFF")},
-				Filename:    "voice.wav",
-				ContentType: "audio/wav",
-			},
-		})
-		require.Error(t, err)
-		require.Contains(t, err.Error(), "capability unsupported")
-	})
-
-	t.Run("rejects unconfigured model", func(t *testing.T) {
+	t.Run("uses built-in transcription model", func(t *testing.T) {
 		ts := NewTestService(t)
 		defer ts.Cleanup()

@ -203,6 +198,16 @@ func TestTranscribe(t *testing.T) {
 		require.NoError(t, err)
 		userCtx := ts.CreateUserContext(ctx, user.ID)

+		openAIServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			require.NoError(t, r.ParseMultipartForm(10<<20))
+			require.Equal(t, "gpt-4o-transcribe", r.FormValue("model"))
+			w.Header().Set("Content-Type", "application/json")
+			require.NoError(t, json.NewEncoder(w).Encode(map[string]string{
+				"text": "built-in model",
+			}))
+		}))
+		defer openAIServer.Close()
+
 		_, err = ts.Store.UpsertInstanceSetting(ctx, &storepb.InstanceSetting{
 			Key: storepb.InstanceSettingKey_AI,
 			Value: &storepb.InstanceSetting_AiSetting{
@ -211,11 +216,9 @@ func TestTranscribe(t *testing.T) {
 						{
 							Id:       "openai-main",
 							Title:    "OpenAI",
-							Type:         storepb.AIProviderType_OPENAI_COMPATIBLE,
-							Endpoint:     "https://example.com/v1",
+							Type:     storepb.AIProviderType_OPENAI,
+							Endpoint: openAIServer.URL,
 							ApiKey:   "sk-test",
-							Models:       []string{"gpt-4o-transcribe"},
-							DefaultModel: "gpt-4o-transcribe",
 						},
 					},
 				},
@ -223,19 +226,17 @@ func TestTranscribe(t *testing.T) {
 		})
 		require.NoError(t, err)

-		_, err = ts.Service.Transcribe(userCtx, &v1pb.TranscribeRequest{
+		resp, err := ts.Service.Transcribe(userCtx, &v1pb.TranscribeRequest{
 			ProviderId: "openai-main",
-			Config: &v1pb.TranscriptionConfig{
-				Model: "other-model",
-			},
+			Config:     &v1pb.TranscriptionConfig{},
 			Audio: &v1pb.TranscriptionAudio{
 				Source:      &v1pb.TranscriptionAudio_Content{Content: []byte("RIFF")},
 				Filename:    "voice.wav",
 				ContentType: "audio/wav",
 			},
 		})
-		require.Error(t, err)
-		require.Contains(t, err.Error(), "not configured")
+		require.NoError(t, err)
+		require.Equal(t, "built-in model", resp.Text)
 	})

 	t.Run("rejects non-audio content before provider call", func(t *testing.T) {
@ -254,11 +255,9 @@ func TestTranscribe(t *testing.T) {
 						{
 							Id:       "openai-main",
 							Title:    "OpenAI",
-							Type:         storepb.AIProviderType_OPENAI_COMPATIBLE,
+							Type:     storepb.AIProviderType_OPENAI,
 							Endpoint: "https://example.com/v1",
 							ApiKey:   "sk-test",
-							Models:       []string{"gpt-4o-transcribe"},
-							DefaultModel: "gpt-4o-transcribe",
 						},
 					},
 				},
@ -268,9 +267,7 @@ func TestTranscribe(t *testing.T) {

 		_, err = ts.Service.Transcribe(userCtx, &v1pb.TranscribeRequest{
 			ProviderId: "openai-main",
-			Config: &v1pb.TranscriptionConfig{
-				Model: "gpt-4o-transcribe",
-			},
+			Config:     &v1pb.TranscriptionConfig{},
 			Audio: &v1pb.TranscriptionAudio{
 				Source:      &v1pb.TranscriptionAudio_Content{Content: []byte("not audio")},
 				Filename:    "notes.txt",
--- a/server/router/api/v1/test/instance_service_test.go
+++ b/server/router/api/v1/test/instance_service_test.go
@ -238,7 +238,7 @@ func TestGetInstanceSetting(t *testing.T) {
 			"SmtpPassword must never be returned in responses")
 	})

-	t.Run("GetInstanceSetting - AI setting requires admin", func(t *testing.T) {
+	t.Run("GetInstanceSetting - AI setting requires authenticated user", func(t *testing.T) {
 		ts := NewTestService(t)
 		defer ts.Cleanup()

@ -256,11 +256,12 @@ func TestGetInstanceSetting(t *testing.T) {
 		require.Error(t, err)
 		require.Contains(t, err.Error(), "not authenticated")

-		_, err = ts.Service.GetInstanceSetting(userCtx, req)
-		require.Error(t, err)
-		require.Contains(t, err.Error(), "permission denied")
+		resp, err := ts.Service.GetInstanceSetting(userCtx, req)
+		require.NoError(t, err)
+		require.NotNil(t, resp.GetAiSetting())
+		require.Empty(t, resp.GetAiSetting().GetProviders())

-		resp, err := ts.Service.GetInstanceSetting(adminCtx, req)
+		resp, err = ts.Service.GetInstanceSetting(adminCtx, req)
 		require.NoError(t, err)
 		require.NotNil(t, resp.GetAiSetting())
 		require.Empty(t, resp.GetAiSetting().GetProviders())
@ -304,8 +305,6 @@ func TestUpdateInstanceSetting(t *testing.T) {
 							Title:  "OpenAI",
 							Type:   v1pb.InstanceSetting_OPENAI,
 							ApiKey: "sk-test",
-							Models:       []string{"gpt-4o-transcribe"},
-							DefaultModel: "gpt-4o-transcribe",
 						},
 					},
 				},
@ -573,8 +572,6 @@ func TestUpdateInstanceSetting(t *testing.T) {
 								Title:  "OpenAI",
 								Type:   v1pb.InstanceSetting_OPENAI,
 								ApiKey: "sk-original",
-								Models:       []string{"gpt-5.4", "gpt-5.4-mini"},
-								DefaultModel: "gpt-5.4",
 							},
 						},
 					},
@ -605,8 +602,6 @@ func TestUpdateInstanceSetting(t *testing.T) {
 								Title:  "OpenAI primary",
 								Type:   v1pb.InstanceSetting_OPENAI,
 								ApiKey: "",
-								Models:       []string{"gpt-5.4-mini", "gpt-5.4-mini", "gpt-5.4"},
-								DefaultModel: "",
 							},
 						},
 					},
@ -621,42 +616,5 @@ func TestUpdateInstanceSetting(t *testing.T) {
 		require.Equal(t, "sk-original", stored.GetProviders()[0].GetApiKey(),
 			"existing AI provider API key must be preserved when an empty value is sent")
 		require.Equal(t, "OpenAI primary", stored.GetProviders()[0].GetTitle())
-		require.Equal(t, []string{"gpt-5.4-mini", "gpt-5.4"}, stored.GetProviders()[0].GetModels())
-		require.Equal(t, "gpt-5.4-mini", stored.GetProviders()[0].GetDefaultModel())
-	})
-
-	t.Run("UpdateInstanceSetting - Anthropic provider gets default endpoint", func(t *testing.T) {
-		ts := NewTestService(t)
-		defer ts.Cleanup()
-
-		hostUser, err := ts.CreateHostUser(ctx, "admin")
-		require.NoError(t, err)
-		adminCtx := ts.CreateUserContext(ctx, hostUser.ID)
-
-		_, err = ts.Service.UpdateInstanceSetting(adminCtx, &v1pb.UpdateInstanceSettingRequest{
-			Setting: &v1pb.InstanceSetting{
-				Name: "instance/settings/AI",
-				Value: &v1pb.InstanceSetting_AiSetting{
-					AiSetting: &v1pb.InstanceSetting_AISetting{
-						Providers: []*v1pb.InstanceSetting_AIProviderConfig{
-							{
-								Id:           "anthropic-main",
-								Title:        "Anthropic",
-								Type:         v1pb.InstanceSetting_ANTHROPIC,
-								ApiKey:       "sk-ant-test",
-								Models:       []string{"claude-sonnet-4-5"},
-								DefaultModel: "claude-sonnet-4-5",
-							},
-						},
-					},
-				},
-			},
-		})
-		require.NoError(t, err)
-
-		stored, err := ts.Store.GetInstanceAISetting(ctx)
-		require.NoError(t, err)
-		require.Len(t, stored.GetProviders(), 1)
-		require.Equal(t, "https://api.anthropic.com/v1", stored.GetProviders()[0].GetEndpoint())
 	})
 }
--- a/store/test/instance_setting_test.go
+++ b/store/test/instance_setting_test.go
@ -347,17 +347,13 @@ func TestInstanceSettingAISetting(t *testing.T) {
 						Type:     storepb.AIProviderType_OPENAI,
 						Endpoint: "https://api.openai.com/v1",
 						ApiKey:   "sk-test",
-						Models:       []string{"gpt-5.4", "gpt-5.4-mini"},
-						DefaultModel: "gpt-5.4",
 					},
 					{
-						Id:           "company-gateway",
-						Title:        "Company Gateway",
-						Type:         storepb.AIProviderType_OPENAI_COMPATIBLE,
-						Endpoint:     "https://llm.example.com/v1",
-						ApiKey:       "gw-test",
-						Models:       []string{"qwen-plus"},
-						DefaultModel: "qwen-plus",
+						Id:       "gemini-main",
+						Title:    "Gemini",
+						Type:     storepb.AIProviderType_GEMINI,
+						Endpoint: "https://generativelanguage.googleapis.com/v1beta",
+						ApiKey:   "gemini-test",
 					},
 				},
 			},
@ -370,7 +366,7 @@ func TestInstanceSettingAISetting(t *testing.T) {
 	require.Len(t, aiSetting.Providers, 2)
 	require.Equal(t, "openai-main", aiSetting.Providers[0].Id)
 	require.Equal(t, "sk-test", aiSetting.Providers[0].ApiKey)
-	require.Equal(t, "company-gateway", aiSetting.Providers[1].Id)
+	require.Equal(t, "gemini-main", aiSetting.Providers[1].Id)

 	ts.Close()
 }
--- a/web/src/components/MemoEditor/components/AudioRecorderPanel.tsx
+++ b/web/src/components/MemoEditor/components/AudioRecorderPanel.tsx
@ -1,21 +1,35 @@
-import { LoaderCircleIcon, XIcon } from "lucide-react";
+import { AudioWaveformIcon, LoaderCircleIcon, SquareIcon, XIcon } from "lucide-react";
 import type { FC } from "react";
 import { formatAudioTime } from "@/components/MemoMetadata/Attachment/attachmentHelpers";
 import { Button } from "@/components/ui/button";
+import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
 import { cn } from "@/lib/utils";
 import { useTranslate } from "@/utils/i18n";
 import { useAudioWaveform } from "../hooks/useAudioWaveform";
 import type { AudioRecorderPanelProps } from "../types/components";
 import { VoiceWaveform } from "./VoiceWaveform";

-export const AudioRecorderPanel: FC<AudioRecorderPanelProps> = ({ audioRecorder, mediaStream, onStop, onCancel }) => {
+export const AudioRecorderPanel: FC<AudioRecorderPanelProps> = ({
+  audioRecorder,
+  mediaStream,
+  onStop,
+  onCancel,
+  onTranscribe,
+  canTranscribe = false,
+  isTranscribing = false,
+}) => {
  const t = useTranslate();
  const { status, elapsedSeconds } = audioRecorder;

  const isRequestingPermission = status === "requesting_permission";
  const isRecording = status === "recording";
+  const isTranscribeDisabled = !canTranscribe || isRequestingPermission || isTranscribing;
  const waveformLevels = useAudioWaveform(mediaStream, isRecording && mediaStream !== null);
-  const srStatusText = isRequestingPermission ? t("editor.audio-recorder.requesting-permission") : t("editor.audio-recorder.recording");
+  const srStatusText = isTranscribing
+    ? t("editor.audio-recorder.transcribing")
+    : isRequestingPermission
+      ? t("editor.audio-recorder.requesting-permission")
+      : t("editor.audio-recorder.recording");

  return (
    <div
@ -25,10 +39,14 @@ export const AudioRecorderPanel: FC<AudioRecorderPanelProps> = ({ audioRecorder,
      )}
    >
      <div className="flex min-w-0 flex-1 items-center gap-2">
-        {isRequestingPermission ? <LoaderCircleIcon className="size-3.5 shrink-0 animate-spin text-muted-foreground" aria-hidden /> : null}
+        {isRequestingPermission || isTranscribing ? (
+          <LoaderCircleIcon className="size-3.5 shrink-0 animate-spin text-muted-foreground" aria-hidden />
+        ) : null}
        <span className="sr-only">{srStatusText}</span>
        <VoiceWaveform levels={waveformLevels} className="max-w-[200px] overflow-hidden" />
-        <span className="shrink-0 font-mono text-xs tabular-nums text-muted-foreground">{formatAudioTime(elapsedSeconds)}</span>
+        <span className="shrink-0 font-mono text-xs tabular-nums text-muted-foreground">
+          {isTranscribing ? t("editor.audio-recorder.transcribing") : formatAudioTime(elapsedSeconds)}
+        </span>
      </div>

      <div className="flex shrink-0 items-center gap-1 border-l border-border/60 pl-2">
@ -36,22 +54,43 @@ export const AudioRecorderPanel: FC<AudioRecorderPanelProps> = ({ audioRecorder,
          type="button"
          variant="ghost"
          size="icon"
-          className="size-7 shrink-0 rounded-full text-muted-foreground hover:bg-accent hover:text-foreground"
+          className="rounded-full"
          onClick={onCancel}
+          disabled={isTranscribing}
          aria-label={t("common.cancel")}
        >
-          <XIcon className="size-3.25" />
+          <XIcon className="size-4" />
        </Button>
+        <Tooltip>
+          <TooltipTrigger asChild>
+            <span className="-ml-2 inline-flex">
+              <Button
+                type="button"
+                variant="ghost"
+                size="icon"
+                className="rounded-full"
+                onClick={onTranscribe}
+                disabled={isTranscribeDisabled}
+                aria-label={canTranscribe ? t("editor.audio-recorder.transcribe") : t("editor.audio-recorder.configure-ai-provider")}
+              >
+                <AudioWaveformIcon className="size-4" />
+              </Button>
+            </span>
+          </TooltipTrigger>
+          <TooltipContent side="top">
+            <p>{canTranscribe ? t("editor.audio-recorder.transcribe") : t("editor.audio-recorder.configure-ai-provider")}</p>
+          </TooltipContent>
+        </Tooltip>
        <Button
          type="button"
          variant="destructive"
          size="icon"
-          className="size-7 shrink-0 rounded-full shadow-xs"
+          className="rounded-full"
          onClick={onStop}
-          disabled={isRequestingPermission}
+          disabled={isRequestingPermission || isTranscribing}
          aria-label={t("editor.audio-recorder.stop")}
        >
-          <span className="size-[7px] rounded-[1.5px] bg-destructive-foreground" aria-hidden />
+          <SquareIcon className="size-4" />
        </Button>
      </div>
    </div>
--- a/web/src/components/MemoEditor/hooks/useAudioRecorder.ts
+++ b/web/src/components/MemoEditor/hooks/useAudioRecorder.ts
@ -3,6 +3,7 @@ import type { LocalFile } from "../types/attachment";
 import { useBlobUrls } from "./useBlobUrls";

 const FALLBACK_AUDIO_MIME_TYPE = "audio/webm";
+export type AudioRecordingCompleteMode = "attach" | "transcribe";

 interface AudioRecorderActions {
  setAudioRecorderSupport: (value: boolean) => void;
@ -10,7 +11,8 @@ interface AudioRecorderActions {
  setAudioRecorderStatus: (value: "idle" | "requesting_permission" | "recording" | "error" | "unsupported") => void;
  setAudioRecorderElapsed: (value: number) => void;
  setAudioRecorderError: (value?: string) => void;
-  onRecordingComplete: (localFile: LocalFile) => void;
+  onRecordingComplete: (localFile: LocalFile, mode: AudioRecordingCompleteMode) => void;
+  onRecordingEmpty?: (mode: AudioRecordingCompleteMode) => void;
 }

 const AUDIO_MIME_TYPE_CANDIDATES = ["audio/webm;codecs=opus", "audio/webm", "audio/mp4", "audio/ogg;codecs=opus"] as const;
@ -55,6 +57,7 @@ export const useAudioRecorder = (actions: AudioRecorderActions) => {
  const startedAtRef = useRef<number | null>(null);
  const elapsedTimerRef = useRef<number | null>(null);
  const recorderMimeTypeRef = useRef<string>(FALLBACK_AUDIO_MIME_TYPE);
+  const completionModeRef = useRef<AudioRecordingCompleteMode>("attach");
  const startRequestIdRef = useRef(0);
  const { createBlobUrl } = useBlobUrls();

@ -153,10 +156,13 @@ export const useAudioRecorder = (actions: AudioRecorderActions) => {

        const durationSeconds = startedAtRef.current ? Math.max(0, Math.round((Date.now() - startedAtRef.current) / 1000)) : 0;
        const blob = new Blob(chunksRef.current, { type: recorderMimeTypeRef.current });
+        const completionMode = completionModeRef.current;
+        completionModeRef.current = "attach";
        if (blob.size === 0) {
          actions.setAudioRecorderElapsed(0);
          actions.setAudioRecorderError(undefined);
          actions.setAudioRecorderStatus("idle");
+          actions.onRecordingEmpty?.(completionMode);
          resetRecorderRefs();
          return;
        }
@ -164,14 +170,17 @@ export const useAudioRecorder = (actions: AudioRecorderActions) => {
        const file = createRecordedFile(blob, recorderMimeTypeRef.current);
        const previewUrl = createBlobUrl(file);

-        actions.onRecordingComplete({
+        actions.onRecordingComplete(
+          {
            file,
            previewUrl,
            origin: "audio_recording",
            audioMeta: {
              durationSeconds,
            },
-        });
+          },
+          completionMode,
+        );
        actions.setAudioRecorderElapsed(0);
        actions.setAudioRecorderError(undefined);
        actions.setAudioRecorderStatus("idle");
@ -203,17 +212,20 @@ export const useAudioRecorder = (actions: AudioRecorderActions) => {
    }
  };

-  const stopRecording = () => {
+  /**
+   * Stops the active recording and remembers how the finished clip should be handled.
+   *
+   * @param mode - "attach" (default) or "transcribe"; any other value is treated as "attach".
+   * @returns true when a stop was issued, false when there is no active recorder to stop.
+   */
+  const stopRecording = (mode: AudioRecordingCompleteMode = "attach") => {
     if (!mediaRecorderRef.current || mediaRecorderRef.current.state === "inactive") {
-      return;
+      return false;
     }
 
+    // This function is also handed directly to a click handler (`onStop={audioRecorder.stopRecording}`),
+    // in which case `mode` is the click event rather than a mode string. Normalize it so only a
+    // real mode value is stored and later forwarded to the completion callbacks.
+    completionModeRef.current = mode === "transcribe" ? "transcribe" : "attach";
     cleanupTimer();
     mediaRecorderRef.current.stop();
+    return true;
   };

  const resetRecording = () => {
    startRequestIdRef.current += 1;
+    completionModeRef.current = "attach";
    resetRecorderRefs();
    actions.setAudioRecorderElapsed(0);
    actions.setAudioRecorderError(undefined);
--- a/web/src/components/MemoEditor/index.tsx
+++ b/web/src/components/MemoEditor/index.tsx
@ -1,12 +1,14 @@
 import { useQueryClient } from "@tanstack/react-query";
-import { useEffect, useMemo, useRef, useState } from "react";
+import { useCallback, useEffect, useMemo, useRef, useState } from "react";
 import { toast } from "react-hot-toast";
 import { useAuth } from "@/contexts/AuthContext";
+import { useInstance } from "@/contexts/InstanceContext";
 import useCurrentUser from "@/hooks/useCurrentUser";
 import { memoKeys } from "@/hooks/useMemoQueries";
 import { userKeys } from "@/hooks/useUserQueries";
 import { handleError } from "@/lib/error";
 import { cn } from "@/lib/utils";
+import { InstanceSetting_AIProviderType, InstanceSetting_Key } from "@/types/proto/api/v1/instance_service_pb";
 import { useTranslate } from "@/utils/i18n";
 import { convertVisibilityFromString } from "@/utils/memo";
 import {
@ -21,9 +23,15 @@ import {
 import { FOCUS_MODE_STYLES } from "./constants";
 import type { EditorRefActions } from "./Editor";
 import { useAudioRecorder, useAutoSave, useFocusMode, useKeyboard, useMemoInit } from "./hooks";
-import { cacheService, errorService, memoService, validationService } from "./services";
+import { cacheService, errorService, memoService, transcriptionService, validationService } from "./services";
 import { EditorProvider, useEditorContext } from "./state";
 import type { MemoEditorProps } from "./types";
+import type { LocalFile } from "./types/attachment";
+
+const TRANSCRIPTION_PROVIDER_TYPES: InstanceSetting_AIProviderType[] = [
+  InstanceSetting_AIProviderType.OPENAI,
+  InstanceSetting_AIProviderType.GEMINI,
+];

 const MemoEditor = (props: MemoEditorProps) => (
  <EditorProvider>
@ -47,9 +55,15 @@ const MemoEditorImpl: React.FC<MemoEditorProps> = ({
  const editorRef = useRef<EditorRefActions>(null);
  const { state, actions, dispatch } = useEditorContext();
  const { userGeneralSetting } = useAuth();
+  const { aiSetting, fetchSetting } = useInstance();
  const [isAudioRecorderOpen, setIsAudioRecorderOpen] = useState(false);
+  const [isTranscribingAudio, setIsTranscribingAudio] = useState(false);

  const memoName = memo?.name;
+  const transcriptionProvider = useMemo(
+    () => aiSetting.providers.find((provider) => provider.apiKeySet && TRANSCRIPTION_PROVIDER_TYPES.includes(provider.type)),
+    [aiSetting.providers],
+  );

  // Get default visibility from user settings
  const defaultVisibility = userGeneralSetting?.memoVisibility ? convertVisibilityFromString(userGeneralSetting.memoVisibility) : undefined;
@ -62,6 +76,62 @@ const MemoEditorImpl: React.FC<MemoEditorProps> = ({
  // Focus mode management with body scroll lock
  useFocusMode(state.ui.isFocusMode);

+  useEffect(() => {
+    if (!currentUser) {
+      return;
+    }
+
+    void fetchSetting(InstanceSetting_Key.AI).catch(() => undefined);
+  }, [currentUser, fetchSetting]);
+
+  // Inserts transcribed text at the editor's current cursor position.
+  // A prefix/suffix of newlines is computed from the content around the cursor and handed to
+  // `editor.insertText`: nothing at the very start/end of the content or next to an existing
+  // "\n\n", a single "\n" next to a lone newline, and "\n\n" otherwise.
+  // Does nothing when the editor ref is not set.
+  const insertTranscribedText = useCallback((text: string) => {
+    const editor = editorRef.current;
+    if (!editor) {
+      return;
+    }
+
+    const content = editor.getContent();
+    const cursor = editor.getCursorPosition();
+    const beforeCursor = content.slice(0, cursor);
+    const afterCursor = content.slice(cursor);
+    const prefix = beforeCursor.length === 0 || beforeCursor.endsWith("\n\n") ? "" : beforeCursor.endsWith("\n") ? "\n" : "\n\n";
+    const suffix = afterCursor.length === 0 || afterCursor.startsWith("\n\n") ? "" : afterCursor.startsWith("\n") ? "\n" : "\n\n";
+
+    editor.insertText(text, prefix, suffix);
+    editor.scrollToCursor();
+  }, []);
+
+  // Transcribes a finished recording and inserts the resulting text into the editor.
+  // The recording is kept as a local file attachment instead whenever no text is produced:
+  // no transcription provider is available, the transcript is empty after trimming, or the
+  // request throws. In every case the transcribing flag is cleared and the recorder panel closed.
+  const handleTranscribeRecordedAudio = useCallback(
+    async (localFile: LocalFile) => {
+      if (!transcriptionProvider) {
+        dispatch(actions.addLocalFile(localFile));
+        setIsTranscribingAudio(false);
+        setIsAudioRecorderOpen(false);
+        return;
+      }
+
+      try {
+        const text = (await transcriptionService.transcribeFile(localFile.file, transcriptionProvider)).trim();
+        if (!text) {
+          // Empty transcript: keep the audio so the recording is not lost.
+          dispatch(actions.addLocalFile(localFile));
+          toast.error(t("editor.audio-recorder.transcribe-empty"));
+          return;
+        }
+
+        insertTranscribedText(text);
+        toast.success(t("editor.audio-recorder.transcribe-success"));
+      } catch (error) {
+        console.error(error);
+        toast.error(errorService.getErrorMessage(error) || t("editor.audio-recorder.transcribe-error"));
+        // Request failed: keep the audio as an attachment.
+        dispatch(actions.addLocalFile(localFile));
+      } finally {
+        // Runs for success, empty-transcript early return, and failure alike.
+        setIsTranscribingAudio(false);
+        setIsAudioRecorderOpen(false);
+      }
+    },
+    [actions, dispatch, insertTranscribedText, t, transcriptionProvider],
+  );
+
  const audioRecorderActions = useMemo(
    () => ({
      setAudioRecorderSupport: (value: boolean) => dispatch(actions.setAudioRecorderSupport(value)),
@ -70,12 +140,24 @@ const MemoEditorImpl: React.FC<MemoEditorProps> = ({
        dispatch(actions.setAudioRecorderStatus(value)),
      setAudioRecorderElapsed: (value: number) => dispatch(actions.setAudioRecorderElapsed(value)),
      setAudioRecorderError: (value?: string) => dispatch(actions.setAudioRecorderError(value)),
-      onRecordingComplete: (localFile: (typeof state.localFiles)[number]) => {
+      onRecordingComplete: (localFile: LocalFile, mode: "attach" | "transcribe") => {
+        if (mode === "transcribe") {
+          void handleTranscribeRecordedAudio(localFile);
+          return;
+        }
+
        dispatch(actions.addLocalFile(localFile));
        setIsAudioRecorderOpen(false);
      },
+      onRecordingEmpty: (mode: "attach" | "transcribe") => {
+        if (mode === "transcribe") {
+          setIsTranscribingAudio(false);
+          toast.error(t("editor.audio-recorder.transcribe-empty"));
+        }
+        setIsAudioRecorderOpen(false);
+      },
    }),
-    [actions, dispatch, state.localFiles],
+    [actions, dispatch, handleTranscribeRecordedAudio, t],
  );

  const audioRecorder = useAudioRecorder(audioRecorderActions);
@ -109,10 +191,23 @@ const MemoEditorImpl: React.FC<MemoEditorProps> = ({
  };

  const handleCancelAudioRecording = () => {
+    setIsTranscribingAudio(false);
    audioRecorder.resetRecording();
    setIsAudioRecorderOpen(false);
  };

+  // Click handler for the transcribe button: marks transcription as in progress and stops the
+  // recorder in "transcribe" mode. No-op when no provider is available or a transcription is
+  // already running.
+  const handleTranscribeAudioRecording = () => {
+    if (!transcriptionProvider || isTranscribingAudio) {
+      return;
+    }
+
+    setIsTranscribingAudio(true);
+    const didStop = audioRecorder.stopRecording("transcribe");
+    if (!didStop) {
+      // Nothing was stopped, so no completion callback will clear the flag; undo it here.
+      setIsTranscribingAudio(false);
+    }
+  };
+
  useKeyboard(editorRef, handleSave);

  async function handleSave() {
@ -203,12 +298,16 @@ const MemoEditorImpl: React.FC<MemoEditorProps> = ({
        {/* Editor content grows to fill available space in focus mode */}
        <EditorContent ref={editorRef} placeholder={placeholder} />

-        {isAudioRecorderOpen && (state.audioRecorder.status === "recording" || state.audioRecorder.status === "requesting_permission") && (
+        {isAudioRecorderOpen &&
+          (state.audioRecorder.status === "recording" || state.audioRecorder.status === "requesting_permission" || isTranscribingAudio) && (
            <AudioRecorderPanel
              audioRecorder={state.audioRecorder}
              mediaStream={audioRecorder.recordingStream}
              onStop={audioRecorder.stopRecording}
              onCancel={handleCancelAudioRecording}
+              onTranscribe={handleTranscribeAudioRecording}
+              canTranscribe={!!transcriptionProvider}
+              isTranscribing={isTranscribingAudio}
            />
          )}

--- a/web/src/components/MemoEditor/services/errorService.ts
+++ b/web/src/components/MemoEditor/services/errorService.ts
@ -1,5 +1,9 @@
 export const errorService = {
  getErrorMessage(error: unknown): string {
+    if (error && typeof error === "object" && "rawMessage" in error) {
+      return (error as { rawMessage?: string }).rawMessage || "An error occurred";
+    }
+
    // Handle ConnectError or errors with details property
    if (error && typeof error === "object" && "details" in error) {
      return (error as { details?: string }).details || "An error occurred";
--- a/web/src/components/MemoEditor/services/index.ts
+++ b/web/src/components/MemoEditor/services/index.ts
@ -1,5 +1,6 @@
 export * from "./cacheService";
 export * from "./errorService";
 export * from "./memoService";
+export * from "./transcriptionService";
 export * from "./uploadService";
 export * from "./validationService";
--- a/web/src/components/MemoEditor/services/transcriptionService.ts
+++ b/web/src/components/MemoEditor/services/transcriptionService.ts
@ -0,0 +1,26 @@
+import { create } from "@bufbuild/protobuf";
+import { aiServiceClient } from "@/connect";
+import { TranscribeRequestSchema, TranscriptionAudioSchema, TranscriptionConfigSchema } from "@/types/proto/api/v1/ai_service_pb";
+import type { InstanceSetting_AIProviderConfig } from "@/types/proto/api/v1/instance_service_pb";
+
+/** Client-side helper for sending an audio file to the AI service's transcribe call. */
+export const transcriptionService = {
+  /**
+   * Reads the whole file into memory and submits its bytes inline to `aiServiceClient.transcribe`.
+   *
+   * @param file - File to transcribe; its name and MIME type are forwarded unchanged.
+   * @param provider - Provider configuration; only `provider.id` is sent with the request.
+   * @returns The `text` field of the transcription response.
+   */
+  async transcribeFile(file: File, provider: InstanceSetting_AIProviderConfig): Promise<string> {
+    const content = new Uint8Array(await file.arrayBuffer());
+    const response = await aiServiceClient.transcribe(
+      create(TranscribeRequestSchema, {
+        providerId: provider.id,
+        // No per-request options are set on the config.
+        config: create(TranscriptionConfigSchema, {}),
+        audio: create(TranscriptionAudioSchema, {
+          source: {
+            case: "content",
+            value: content,
+          },
+          filename: file.name,
+          contentType: file.type,
+        }),
+      }),
+    );
+
+    return response.text;
+  },
+};
--- a/web/src/components/MemoEditor/types/components.ts
+++ b/web/src/components/MemoEditor/types/components.ts
@ -36,6 +36,9 @@ export interface AudioRecorderPanelProps {
  mediaStream: MediaStream | null;
  onStop: () => void;
  onCancel: () => void;
+  onTranscribe?: () => void;
+  canTranscribe?: boolean;
+  isTranscribing?: boolean;
 }

 export interface FocusModeOverlayProps {
--- a/web/src/components/Settings/AISection.tsx
+++ b/web/src/components/Settings/AISection.tsx
@ -10,7 +10,6 @@ import { DropdownMenu, DropdownMenuContent, DropdownMenuItem, DropdownMenuTrigge
 import { Input } from "@/components/ui/input";
 import { Label } from "@/components/ui/label";
 import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select";
-import { Textarea } from "@/components/ui/textarea";
 import { useInstance } from "@/contexts/InstanceContext";
 import { handleError } from "@/lib/error";
 import {
@ -34,16 +33,11 @@ type LocalAIProvider = {
  apiKey: string;
  apiKeySet: boolean;
  apiKeyHint: string;
-  models: string[];
-  defaultModel: string;
 };

-const providerTypeOptions = [
-  InstanceSetting_AIProviderType.OPENAI,
-  InstanceSetting_AIProviderType.OPENAI_COMPATIBLE,
-  InstanceSetting_AIProviderType.GEMINI,
-  InstanceSetting_AIProviderType.ANTHROPIC,
-];
+const providerTypeOptions = [InstanceSetting_AIProviderType.OPENAI, InstanceSetting_AIProviderType.GEMINI];
+
+const byokNotes = ["setting.ai.byok-key-note", "setting.ai.byok-storage-note", "setting.ai.byok-model-note"] as const;

 const createProviderID = () => {
  if (typeof crypto !== "undefined" && "randomUUID" in crypto) {
@ -64,18 +58,8 @@ const toLocalProvider = (provider: InstanceSetting_AIProviderConfig): LocalAIPro
  apiKey: "",
  apiKeySet: provider.apiKeySet,
  apiKeyHint: provider.apiKeyHint,
-  models: [...provider.models],
-  defaultModel: provider.defaultModel,
 });

-const normalizeModels = (value: string) => {
-  const models = value
-    .split(/\r?\n/)
-    .map((model) => model.trim())
-    .filter(Boolean);
-  return Array.from(new Set(models));
-};
-
 const newProvider = (): LocalAIProvider => ({
  id: createProviderID(),
  title: "",
@ -84,8 +68,6 @@ const newProvider = (): LocalAIProvider => ({
  apiKey: "",
  apiKeySet: false,
  apiKeyHint: "",
-  models: [],
-  defaultModel: "",
 });

 const toProviderConfig = (provider: LocalAIProvider) =>
@ -95,8 +77,6 @@ const toProviderConfig = (provider: LocalAIProvider) =>
    type: provider.type,
    endpoint: provider.endpoint.trim(),
    apiKey: provider.apiKey,
-    models: provider.models,
-    defaultModel: provider.defaultModel.trim(),
  });

 const AISection = () => {
@ -124,36 +104,20 @@ const AISection = () => {
  const handleSaveProvider = (provider: LocalAIProvider) => {
    const title = provider.title.trim();
    const endpoint = provider.endpoint.trim();
-    const models = provider.models.map((model) => model.trim()).filter(Boolean);
-    const defaultModel = provider.defaultModel.trim() || models[0] || "";

    if (!title) {
      toast.error(t("setting.ai.provider-title-required"));
      return;
    }
-    if (provider.type === InstanceSetting_AIProviderType.OPENAI_COMPATIBLE && !endpoint) {
-      toast.error(t("setting.ai.endpoint-required"));
-      return;
-    }
    if (!provider.apiKeySet && !provider.apiKey.trim()) {
      toast.error(t("setting.ai.api-key-required"));
      return;
    }
-    if (models.length === 0) {
-      toast.error(t("setting.ai.models-required"));
-      return;
-    }
-    if (defaultModel && !models.includes(defaultModel)) {
-      toast.error(t("setting.ai.default-model-required"));
-      return;
-    }

    const normalizedProvider = {
      ...provider,
      title,
      endpoint,
-      models,
-      defaultModel,
    };
    setProviders((prev) => {
      const exists = prev.some((item) => item.id === normalizedProvider.id);
@ -203,6 +167,26 @@ const AISection = () => {
        </Button>
      }
    >
+      <section className="rounded-lg border border-border bg-muted/30 px-4 py-3">
+        <div className="flex max-w-3xl flex-col gap-2">
+          <div className="flex flex-wrap items-center gap-2">
+            <span className="rounded-md border border-border bg-background px-2 py-0.5 text-xs font-medium text-foreground">
+              {t("setting.ai.byok-label")}
+            </span>
+            <h4 className="text-sm font-semibold text-foreground">{t("setting.ai.byok-title")}</h4>
+          </div>
+          <p className="text-sm text-muted-foreground">{t("setting.ai.byok-description")}</p>
+          <ul className="space-y-1 text-sm text-muted-foreground">
+            {byokNotes.map((note) => (
+              <li key={note} className="flex gap-2">
+                <span className="mt-2 size-1 rounded-full bg-muted-foreground/60" aria-hidden />
+                <span>{t(note)}</span>
+              </li>
+            ))}
+          </ul>
+        </div>
+      </section>
+
      <SettingGroup title={t("setting.ai.providers")} description={t("setting.ai.description")}>
        <SettingTable
          columns={[
@ -222,13 +206,10 @@ const AISection = () => {
              render: (_, provider: LocalAIProvider) => <span>{getProviderTypeLabel(provider.type)}</span>,
            },
            {
-              key: "models",
-              header: t("setting.ai.models"),
+              key: "endpoint",
+              header: t("setting.ai.endpoint"),
              render: (_, provider: LocalAIProvider) => (
-                <div className="flex flex-col gap-0.5">
-                  <span className="text-foreground">{provider.defaultModel || provider.models[0] || "-"}</span>
-                  <span className="text-xs text-muted-foreground">{t("setting.ai.model-count", { count: provider.models.length })}</span>
-                </div>
+                <span className="font-mono text-xs">{provider.endpoint || t("setting.ai.default-endpoint")}</span>
              ),
            },
            {
@ -299,12 +280,10 @@ interface AIProviderDialogProps {
 const AIProviderDialog = ({ provider, onOpenChange, onSave }: AIProviderDialogProps) => {
  const t = useTranslate();
  const [draft, setDraft] = useState<LocalAIProvider>(() => provider ?? newProvider());
-  const [modelsText, setModelsText] = useState("");

  useEffect(() => {
    const next = provider ?? newProvider();
    setDraft(next);
-    setModelsText(next.models.join("\n"));
  }, [provider]);

  const updateDraft = (partial: Partial<LocalAIProvider>) => {
@ -312,10 +291,7 @@ const AIProviderDialog = ({ provider, onOpenChange, onSave }: AIProviderDialogPr
  };

  const handleSave = () => {
-    onSave({
-      ...draft,
-      models: normalizeModels(modelsText),
-    });
+    onSave(draft);
  };

  return (
@ -356,8 +332,9 @@ const AIProviderDialog = ({ provider, onOpenChange, onSave }: AIProviderDialogPr
            <Input
              value={draft.endpoint}
              onChange={(e) => updateDraft({ endpoint: e.target.value })}
-              placeholder={draft.type === InstanceSetting_AIProviderType.OPENAI ? "https://api.openai.com/v1" : "https://example.com/v1"}
+              placeholder={getDefaultEndpointPlaceholder(draft.type)}
            />
+            <p className="text-xs text-muted-foreground">{t("setting.ai.endpoint-hint")}</p>
          </div>

          <div className="flex flex-col gap-1.5 sm:col-span-2">
@ -372,26 +349,6 @@ const AIProviderDialog = ({ provider, onOpenChange, onSave }: AIProviderDialogPr
              <p className="text-xs text-muted-foreground">{t("setting.ai.current-key", { key: draft.apiKeyHint || "-" })}</p>
            )}
          </div>
-
-          <div className="flex flex-col gap-1.5 sm:col-span-2">
-            <Label>{t("setting.ai.models")}</Label>
-            <Textarea
-              className="font-mono text-sm min-h-28"
-              value={modelsText}
-              onChange={(e) => setModelsText(e.target.value)}
-              placeholder={"gpt-4o-transcribe\ngpt-4o-mini-transcribe"}
-            />
-            <p className="text-xs text-muted-foreground">{t("setting.ai.models-hint")}</p>
-          </div>
-
-          <div className="flex flex-col gap-1.5 sm:col-span-2">
-            <Label>{t("setting.ai.default-model")}</Label>
-            <Input
-              value={draft.defaultModel}
-              onChange={(e) => updateDraft({ defaultModel: e.target.value })}
-              placeholder={normalizeModels(modelsText)[0] ?? ""}
-            />
-          </div>
        </div>

        <DialogFooter>
@ -405,4 +362,15 @@ const AIProviderDialog = ({ provider, onOpenChange, onSave }: AIProviderDialogPr
  );
 };

+// Returns the URL used as the endpoint input's placeholder for a provider type,
+// or an empty string for any type other than OPENAI and GEMINI.
+const getDefaultEndpointPlaceholder = (type: InstanceSetting_AIProviderType) => {
+  switch (type) {
+    case InstanceSetting_AIProviderType.OPENAI:
+      return "https://api.openai.com/v1";
+    case InstanceSetting_AIProviderType.GEMINI:
+      return "https://generativelanguage.googleapis.com/v1beta";
+    default:
+      return "";
+  }
+};
+
 export default AISection;
--- a/web/src/contexts/InstanceContext.tsx
+++ b/web/src/contexts/InstanceContext.tsx
@ -1,5 +1,5 @@
 import { create } from "@bufbuild/protobuf";
-import { createContext, type ReactNode, useCallback, useContext, useMemo, useState } from "react";
+import { createContext, type ReactNode, useCallback, useContext, useMemo, useRef, useState } from "react";
 import { instanceServiceClient } from "@/connect";
 import {
  InstanceProfile,
@ -58,6 +58,8 @@ export function InstanceProvider({ children }: { children: ReactNode }) {
    profileLoaded: false,
  });

+  const fetchedSettingsRef = useRef<Set<string>>(new Set());
+
  // Memoize derived settings to prevent unnecessary recalculations
  const generalSetting = useMemo((): InstanceSetting_GeneralSetting => {
    const setting = state.settings.find((s) => s.name === `${instanceSettingNamePrefix}GENERAL`);
@ -128,13 +130,21 @@ export function InstanceProvider({ children }: { children: ReactNode }) {
  }, []);

  const fetchSetting = useCallback(async (key: InstanceSetting_Key) => {
-    const setting = await instanceServiceClient.getInstanceSetting({
-      name: buildInstanceSettingName(key),
-    });
+    const name = buildInstanceSettingName(key); // resource name, used both for the request and as the dedup key
+    if (fetchedSettingsRef.current.has(name)) { // NOTE(review): nothing in this hunk removes a name after a successful fetch, so later calls never refetch - confirm intended
+      return; // NOTE(review): resolves immediately, without waiting for an in-flight fetch of the same name to update state
+    }
+    fetchedSettingsRef.current.add(name); // recorded before the await, so overlapping calls for the same name hit the early return above
+    try {
+      const setting = await instanceServiceClient.getInstanceSetting({ name });
      setState((prev) => ({
        ...prev,
        settings: [...prev.settings.filter((s) => s.name !== setting.name), setting],
      }));
+    } catch (error) {
+      fetchedSettingsRef.current.delete(name); // failed fetch: unmark the name so a later call can retry
+      throw error; // rethrown unchanged to the caller
+    }
  }, []);

  const updateSetting = useCallback(async (setting: InstanceSetting) => {
--- a/web/src/locales/en.json
+++ b/web/src/locales/en.json
@ -166,6 +166,7 @@
    "audio-recorder": {
      "attachment-label": "Audio recording",
      "attachment-label-with-time": "Audio recording {{time}}",
+      "configure-ai-provider": "Configure an AI provider first",
      "discard": "Discard",
      "error": "Microphone unavailable",
      "error-description": "Try again after checking microphone access for this site.",
@ -184,6 +185,11 @@
      "start": "Start recording",
      "stop": "Stop recording",
      "title": "Audio recorder",
+      "transcribe": "Transcribe",
+      "transcribe-empty": "No speech detected",
+      "transcribe-error": "Failed to transcribe audio",
+      "transcribe-success": "Transcription added",
+      "transcribing": "Transcribing...",
      "trigger": "Record audio",
      "unsupported": "Audio recording unsupported",
      "unsupported-description": "This browser cannot record audio from the memo composer."
@ -390,22 +396,23 @@
      "add-provider": "Add provider",
      "api-key": "API key",
      "api-key-required": "API key is required.",
+      "byok-description": "Connect OpenAI or Gemini with an API key from your own account. Memos calls the provider directly from this server.",
+      "byok-key-note": "Use a key from your provider account; Memos does not bundle shared AI credentials.",
+      "byok-label": "BYOK",
+      "byok-model-note": "Memos selects supported models for features like audio transcription.",
+      "byok-storage-note": "Keys stay on this instance and are masked when settings are loaded.",
+      "byok-title": "Use your own AI account",
      "configured": "Configured",
      "current-key": "Current key: {{key}}",
-      "default-model": "Default model",
-      "default-model-required": "Default model must be listed in models.",
+      "default-endpoint": "Default endpoint",
      "delete-provider": "Delete AI provider `{{title}}`?",
-      "description": "Configure instance-wide AI providers available to server-side AI features.",
-      "dialog-description": "Models are entered manually. Leave the API key blank while editing to keep the stored key.",
+      "description": "Provider keys are supplied by the instance owner and used by server-side AI features.",
+      "dialog-description": "Add a key from your own provider account. Memos uses built-in models for each provider; leave the API key blank while editing to keep the stored key.",
      "edit-provider": "Edit provider",
      "endpoint": "Endpoint",
-      "endpoint-required": "Endpoint is required for OpenAI-compatible providers.",
+      "endpoint-hint": "Leave empty to use the official provider endpoint.",
      "keep-api-key": "Leave blank to keep the existing key",
      "label": "AI",
-      "model-count": "{{count}} models",
-      "models": "Models",
-      "models-hint": "Enter one model per line.",
-      "models-required": "At least one model is required.",
      "no-providers": "No AI providers configured.",
      "provider-title": "Provider name",
      "provider-title-required": "Provider name is required.",
--- a/web/src/locales/zh-Hans.json
+++ b/web/src/locales/zh-Hans.json
@ -133,6 +133,7 @@
    "audio-recorder": {
      "attachment-label": "录音",
      "attachment-label-with-time": "录音 {{time}}",
+      "configure-ai-provider": "请先配置 AI provider",
      "discard": "丢弃",
      "error": "麦克风不可用",
      "error-description": "检查此站点的麦克风访问权限后重试。",
@ -151,6 +152,11 @@
      "start": "开始录音",
      "stop": "停止录音",
      "title": "录音机",
+      "transcribe": "转为文字",
+      "transcribe-empty": "未检测到语音",
+      "transcribe-error": "音频转写失败",
+      "transcribe-success": "转写内容已加入",
+      "transcribing": "转写中...",
      "trigger": "录制音频",
      "unsupported": "不支持录音",
      "unsupported-description": "此浏览器无法录制备忘录编辑器中的音频。"
@ -333,6 +339,33 @@
    "my-account": {
      "label": "我的账号"
    },
+    "ai": {
+      "add-provider": "添加 provider",
+      "api-key": "API key",
+      "api-key-required": "API key 不能为空。",
+      "byok-description": "使用你自己的 OpenAI 或 Gemini 账号 API key。Memos 会从本实例服务器直接调用 provider。",
+      "byok-key-note": "使用你在 provider 账号中创建的 key；Memos 不提供共享 AI 凭据。",
+      "byok-label": "BYOK",
+      "byok-model-note": "Memos 会为音频转写等功能选择内置支持的模型。",
+      "byok-storage-note": "Key 保存在本实例中，加载设置时只返回脱敏提示。",
+      "byok-title": "使用你自己的 AI 账号",
+      "configured": "已配置",
+      "current-key": "当前 key：{{key}}",
+      "default-endpoint": "默认端点",
+      "delete-provider": "删除 AI provider `{{title}}`？",
+      "description": "Provider key 由实例所有者提供，并用于服务端 AI 功能。",
+      "dialog-description": "添加你自己的 provider 账号 key。Memos 会为每个 provider 使用内置模型；编辑时留空 API key 可保留已保存的 key。",
+      "edit-provider": "编辑 provider",
+      "endpoint": "端点",
+      "endpoint-hint": "留空则使用官方 provider 端点。",
+      "keep-api-key": "留空以保留已保存的 key",
+      "label": "AI",
+      "no-providers": "尚未配置 AI provider。",
+      "provider-title": "Provider 名称",
+      "provider-title-required": "Provider 名称不能为空。",
+      "provider-type": "Provider 类型",
+      "providers": "Providers"
+    },
    "preference": {
      "default-memo-sort-option": "备忘录显示时间",
      "default-memo-visibility": "默认备忘录可见性",
--- a/web/src/locales/zh-Hant.json
+++ b/web/src/locales/zh-Hant.json
@ -133,6 +133,7 @@
    "audio-recorder": {
      "attachment-label": "錄音",
      "attachment-label-with-time": "錄音 {{time}}",
+      "configure-ai-provider": "請先配置 AI provider",
      "discard": "丟棄",
      "error": "麥克風不可用",
      "error-description": "檢查此網站的麥克風存取權限後重試。",
@ -151,6 +152,11 @@
      "start": "開始錄音",
      "stop": "停止錄音",
      "title": "錄音機",
+      "transcribe": "轉為文字",
+      "transcribe-empty": "未偵測到語音",
+      "transcribe-error": "音訊轉寫失敗",
+      "transcribe-success": "轉寫內容已加入",
+      "transcribing": "轉寫中...",
      "trigger": "錄製音訊",
      "unsupported": "不支援錄音",
      "unsupported-description": "此瀏覽器無法錄製備忘錄編輯器中的音訊。"
@ -333,6 +339,33 @@
    "my-account": {
      "label": "我的帳號"
    },
+    "ai": {
+      "add-provider": "新增 provider",
+      "api-key": "API key",
+      "api-key-required": "API key 不可為空。",
+      "byok-description": "使用你自己的 OpenAI 或 Gemini 帳號 API key。Memos 會從本實例伺服器直接呼叫 provider。",
+      "byok-key-note": "使用你在 provider 帳號中建立的 key；Memos 不提供共享 AI 憑證。",
+      "byok-label": "BYOK",
+      "byok-model-note": "Memos 會為音訊轉寫等功能選擇內建支援的模型。",
+      "byok-storage-note": "Key 保存在本實例中，載入設定時只返回脫敏提示。",
+      "byok-title": "使用你自己的 AI 帳號",
+      "configured": "已設定",
+      "current-key": "目前 key：{{key}}",
+      "default-endpoint": "預設端點",
+      "delete-provider": "刪除 AI provider `{{title}}`？",
+      "description": "Provider key 由實例擁有者提供，並用於伺服器端 AI 功能。",
+      "dialog-description": "新增你自己的 provider 帳號 key。Memos 會為每個 provider 使用內建模型；編輯時留空 API key 可保留已保存的 key。",
+      "edit-provider": "編輯 provider",
+      "endpoint": "端點",
+      "endpoint-hint": "留空則使用官方 provider 端點。",
+      "keep-api-key": "留空以保留已保存的 key",
+      "label": "AI",
+      "no-providers": "尚未設定 AI provider。",
+      "provider-title": "Provider 名稱",
+      "provider-title-required": "Provider 名稱不可為空。",
+      "provider-type": "Provider 類型",
+      "providers": "Providers"
+    },
    "preference": {
      "default-memo-sort-option": "備忘錄顯示時間",
      "default-memo-visibility": "備忘錄預設瀏覽權限",
--- a/web/src/pages/Setting.tsx
+++ b/web/src/pages/Setting.tsx
@ -1,7 +1,7 @@
 import {
-  BotIcon,
  CogIcon,
  DatabaseIcon,
+  HeartHandshakeIcon,
  KeyIcon,
  LibraryIcon,
  LucideIcon,
@ -48,7 +48,7 @@ const SECTION_ICON_MAP: Record<SettingSection, LucideIcon> = {
  storage: DatabaseIcon,
  tags: TagsIcon,
  sso: KeyIcon,
-  ai: BotIcon,
+  ai: HeartHandshakeIcon,
 };

 const SECTION_COMPONENT_MAP: Record<SettingSection, React.ComponentType> = {
--- a/web/src/types/proto/api/v1/ai_service_pb.ts
+++ b/web/src/types/proto/api/v1/ai_service_pb.ts
@ -13,7 +13,7 @@ import type { Message } from "@bufbuild/protobuf";
 * Describes the file api/v1/ai_service.proto.
 */
 export const file_api_v1_ai_service: GenFile = /*@__PURE__*/
-  fileDesc("ChdhcGkvdjEvYWlfc2VydmljZS5wcm90bxIMbWVtb3MuYXBpLnYxIpsBChFUcmFuc2NyaWJlUmVxdWVzdBIYCgtwcm92aWRlcl9pZBgBIAEoCUID4EECEjYKBmNvbmZpZxgCIAEoCzIhLm1lbW9zLmFwaS52MS5UcmFuc2NyaXB0aW9uQ29uZmlnQgPgQQISNAoFYXVkaW8YAyABKAsyIC5tZW1vcy5hcGkudjEuVHJhbnNjcmlwdGlvbkF1ZGlvQgPgQQIiVQoTVHJhbnNjcmlwdGlvbkNvbmZpZxISCgVtb2RlbBgBIAEoCUID4EEBEhMKBnByb21wdBgCIAEoCUID4EEBEhUKCGxhbmd1YWdlGAMgASgJQgPgQQEidwoSVHJhbnNjcmlwdGlvbkF1ZGlvEhYKB2NvbnRlbnQYASABKAxCA+BBBEgAEg0KA3VyaRgCIAEoCUgAEhUKCGZpbGVuYW1lGAMgASgJQgPgQQESGQoMY29udGVudF90eXBlGAQgASgJQgPgQQFCCAoGc291cmNlIiIKElRyYW5zY3JpYmVSZXNwb25zZRIMCgR0ZXh0GAEgASgJMpoBCglBSVNlcnZpY2USjAEKClRyYW5zY3JpYmUSHy5tZW1vcy5hcGkudjEuVHJhbnNjcmliZVJlcXVlc3QaIC5tZW1vcy5hcGkudjEuVHJhbnNjcmliZVJlc3BvbnNlIjvaQRhwcm92aWRlcl9pZCxjb25maWcsYXVkaW+C0+STAho6ASoiFS9hcGkvdjEvYWk6dHJhbnNjcmliZUKmAQoQY29tLm1lbW9zLmFwaS52MUIOQWlTZXJ2aWNlUHJvdG9QAVowZ2l0aHViLmNvbS91c2VtZW1vcy9tZW1vcy9wcm90by9nZW4vYXBpL3YxO2FwaXYxogIDTUFYqgIMTWVtb3MuQXBpLlYxygIMTWVtb3NcQXBpXFYx4gIYTWVtb3NcQXBpXFYxXEdQQk1ldGFkYXRh6gIOTWVtb3M6OkFwaTo6VjFiBnByb3RvMw", [file_google_api_annotations, file_google_api_client, file_google_api_field_behavior]);
+  fileDesc("ChdhcGkvdjEvYWlfc2VydmljZS5wcm90bxIMbWVtb3MuYXBpLnYxIpsBChFUcmFuc2NyaWJlUmVxdWVzdBIYCgtwcm92aWRlcl9pZBgBIAEoCUID4EECEjYKBmNvbmZpZxgCIAEoCzIhLm1lbW9zLmFwaS52MS5UcmFuc2NyaXB0aW9uQ29uZmlnQgPgQQISNAoFYXVkaW8YAyABKAsyIC5tZW1vcy5hcGkudjEuVHJhbnNjcmlwdGlvbkF1ZGlvQgPgQQIiQQoTVHJhbnNjcmlwdGlvbkNvbmZpZxITCgZwcm9tcHQYASABKAlCA+BBARIVCghsYW5ndWFnZRgCIAEoCUID4EEBIncKElRyYW5zY3JpcHRpb25BdWRpbxIWCgdjb250ZW50GAEgASgMQgPgQQRIABINCgN1cmkYAiABKAlIABIVCghmaWxlbmFtZRgDIAEoCUID4EEBEhkKDGNvbnRlbnRfdHlwZRgEIAEoCUID4EEBQggKBnNvdXJjZSIiChJUcmFuc2NyaWJlUmVzcG9uc2USDAoEdGV4dBgBIAEoCTKaAQoJQUlTZXJ2aWNlEowBCgpUcmFuc2NyaWJlEh8ubWVtb3MuYXBpLnYxLlRyYW5zY3JpYmVSZXF1ZXN0GiAubWVtb3MuYXBpLnYxLlRyYW5zY3JpYmVSZXNwb25zZSI72kEYcHJvdmlkZXJfaWQsY29uZmlnLGF1ZGlvgtPkkwIaOgEqIhUvYXBpL3YxL2FpOnRyYW5zY3JpYmVCpgEKEGNvbS5tZW1vcy5hcGkudjFCDkFpU2VydmljZVByb3RvUAFaMGdpdGh1Yi5jb20vdXNlbWVtb3MvbWVtb3MvcHJvdG8vZ2VuL2FwaS92MTthcGl2MaICA01BWKoCDE1lbW9zLkFwaS5WMcoCDE1lbW9zXEFwaVxWMeICGE1lbW9zXEFwaVxWMVxHUEJNZXRhZGF0YeoCDk1lbW9zOjpBcGk6OlYxYgZwcm90bzM", [file_google_api_annotations, file_google_api_client, file_google_api_field_behavior]);

 /**
 * @generated from message memos.api.v1.TranscribeRequest
@ -52,24 +52,17 @@ export const TranscribeRequestSchema: GenMessage<TranscribeRequest> = /*@__PURE_
 * @generated from message memos.api.v1.TranscriptionConfig
 */
 export type TranscriptionConfig = Message<"memos.api.v1.TranscriptionConfig"> & {
-  /**
-   * Optional. The model to use. If empty, the provider's default model is used.
-   *
-   * @generated from field: string model = 1;
-   */
-  model: string;
-
  /**
   * Optional. A prompt to improve transcription quality.
   *
-   * @generated from field: string prompt = 2;
+   * @generated from field: string prompt = 1;
   */
  prompt: string;

  /**
   * Optional. The language of the input audio.
   *
-   * @generated from field: string language = 3;
+   * @generated from field: string language = 2;
   */
  language: string;
 };
--- a/web/src/types/proto/api/v1/instance_service_pb.ts
+++ b/web/src/types/proto/api/v1/instance_service_pb.ts