feat(metadata): enhance metadata handling by sanitizing extracted data and updating documentation for file storage

This commit is contained in:
2026-03-30 23:14:08 +02:00
parent 72b4f7ce3d
commit e6f00ce636
11 changed files with 108 additions and 17 deletions

View File

@@ -252,6 +252,10 @@ func (c *Client) ExtractMetadata(ctx context.Context, input string) (thoughttype
err = fallbackErr
}
if ctx.Err() != nil {
return thoughttypes.ThoughtMetadata{}, fmt.Errorf("%s metadata: %w", c.name, ctx.Err())
}
heuristic := heuristicMetadataFromInput(input)
if c.log != nil {
c.log.Warn("metadata extraction failed for all models, using heuristic fallback",

View File

@@ -110,6 +110,15 @@ func MarkMetadataComplete(base thoughttypes.ThoughtMetadata, capture config.Capt
return out
}
// SanitizeExtracted returns a copy of in with the attachment list and the
// metadata bookkeeping fields (status, updated-at, last-attempted-at and
// error) reset to their zero values. All other fields are passed through
// unchanged.
//
// The argument is received by value, so the caller's struct is not modified.
func SanitizeExtracted(in thoughttypes.ThoughtMetadata) thoughttypes.ThoughtMetadata {
	sanitized := in

	// Attachments are dropped entirely rather than filtered.
	sanitized.Attachments = nil

	// Reset the status/error pair and both timestamps.
	sanitized.MetadataStatus, sanitized.MetadataError = "", ""
	sanitized.MetadataUpdatedAt, sanitized.MetadataLastAttemptedAt = "", ""

	return sanitized
}
func normalizeList(values []string, limit int) []string {
seen := make(map[string]struct{}, len(values))
result := make([]string, 0, len(values))

View File

@@ -107,6 +107,38 @@ func TestNormalizeDedupesAttachmentsByFileID(t *testing.T) {
}
}
func TestSanitizeExtractedDropsAttachmentsAndMetadataControlFields(t *testing.T) {
id := uuid.New()
got := SanitizeExtracted(thoughttypes.ThoughtMetadata{
Type: "idea",
Attachments: []thoughttypes.ThoughtAttachment{{FileID: id, Name: "secret.pdf"}},
MetadataStatus: MetadataStatusFailed,
MetadataUpdatedAt: "2026-03-30T10:00:00Z",
MetadataLastAttemptedAt: "2026-03-30T10:01:00Z",
MetadataError: "boom",
})
if len(got.Attachments) != 0 {
t.Fatalf("Attachments len = %d, want 0", len(got.Attachments))
}
if got.MetadataStatus != "" {
t.Fatalf("MetadataStatus = %q, want empty", got.MetadataStatus)
}
if got.MetadataUpdatedAt != "" {
t.Fatalf("MetadataUpdatedAt = %q, want empty", got.MetadataUpdatedAt)
}
if got.MetadataLastAttemptedAt != "" {
t.Fatalf("MetadataLastAttemptedAt = %q, want empty", got.MetadataLastAttemptedAt)
}
if got.MetadataError != "" {
t.Fatalf("MetadataError = %q, want empty", got.MetadataError)
}
if got.Type != "idea" {
t.Fatalf("Type = %q, want idea", got.Type)
}
}
func TestMarkMetadataPendingTracksAttemptWithoutClearingPreviousSuccess(t *testing.T) {
attempt := time.Date(2026, 3, 30, 10, 0, 0, 0, time.UTC)
base := thoughttypes.ThoughtMetadata{

View File

@@ -90,7 +90,7 @@ func (t *CaptureTool) Handle(ctx context.Context, req *mcp.CallToolRequest, in C
thought := thoughttypes.Thought{
Content: content,
Embedding: embedding,
Metadata: metadata.Normalize(rawMetadata, t.capture),
Metadata: metadata.Normalize(metadata.SanitizeExtracted(rawMetadata), t.capture),
}
if project != nil {
thought.ProjectID = &project.ID

View File

@@ -243,11 +243,32 @@ func splitDataURL(value string) (contentBase64 string, mediaType string) {
}
// decodeBase64 decodes value as base64, tolerating embedded whitespace and
// several encoding variants.
//
// ASCII spaces, tabs, carriage returns and line feeds are removed first. The
// cleaned text is then tried against the standard, unpadded standard,
// URL-safe and unpadded URL-safe alphabets, in that order, and the first
// successful decode is returned. If every variant fails, the result is nil
// together with the error from the last variant attempted.
func decodeBase64(value string) ([]byte, error) {
	// Strip whitespace up front so wrapped or pretty-printed payloads decode.
	cleaned := strings.Map(func(r rune) rune {
		switch r {
		case ' ', '\t', '\n', '\r':
			return -1 // a negative rune tells strings.Map to drop the character
		default:
			return r
		}
	}, value)

	// Order matters: padded variants are tried before their raw counterparts,
	// and the standard alphabet before the URL-safe one.
	encodings := []*base64.Encoding{
		base64.StdEncoding,
		base64.RawStdEncoding,
		base64.URLEncoding,
		base64.RawURLEncoding,
	}

	var lastErr error
	for _, encoding := range encodings {
		decoded, err := encoding.DecodeString(cleaned)
		if err == nil {
			return decoded, nil
		}
		lastErr = err
	}
	return nil, lastErr
}
func normalizeMediaType(explicit string, fromDataURL string, content []byte) string {

View File

@@ -0,0 +1,28 @@
package tools
import "testing"
// TestDecodeBase64AcceptsWhitespaceAndMultipleVariants checks that
// decodeBase64 accepts input containing whitespace as well as padded and
// unpadded payloads in both the standard and URL-safe alphabets.
func TestDecodeBase64AcceptsWhitespaceAndMultipleVariants(t *testing.T) {
	type testCase struct {
		name  string
		input string
		want  string
	}

	// "--8" uses characters that exist only in the URL-safe alphabet.
	urlSafeWant := string([]byte{0xfb, 0xef})

	cases := []testCase{
		{name: "standard with whitespace", input: "aG V s\nbG8=", want: "hello"},
		{name: "raw standard", input: "aGVsbG8", want: "hello"},
		{name: "standard url-safe payload", input: "--8=", want: urlSafeWant},
		{name: "raw url-safe payload", input: "--8", want: urlSafeWant},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			decoded, err := decodeBase64(c.input)
			if err != nil {
				t.Fatalf("decodeBase64(%q) error = %v", c.input, err)
			}
			if actual := string(decoded); actual != c.want {
				t.Fatalf("decodeBase64(%q) = %q, want %q", c.input, actual, c.want)
			}
		})
	}
}

View File

@@ -82,13 +82,7 @@ func (t *RetryMetadataTool) Handle(ctx context.Context, req *mcp.CallToolRequest
func (r *MetadataRetryer) QueueThought(id uuid.UUID) {
go func() {
attemptCtx := r.backgroundCtx
if r.metadataTimeout > 0 {
var cancel context.CancelFunc
attemptCtx, cancel = context.WithTimeout(r.backgroundCtx, r.metadataTimeout)
defer cancel()
}
if _, err := r.retryOne(attemptCtx, id); err != nil {
if _, err := r.retryOne(r.backgroundCtx, id); err != nil {
r.logger.Warn("background metadata retry failed", slog.String("thought_id", id.String()), slog.String("error", err.Error()))
}
}()
@@ -196,7 +190,7 @@ func (r *MetadataRetryer) retryOne(ctx context.Context, id uuid.UUID) (bool, err
return false, extractErr
}
completedMetadata := metadata.MarkMetadataComplete(extracted, r.capture, attemptedAt)
completedMetadata := metadata.MarkMetadataComplete(metadata.SanitizeExtracted(extracted), r.capture, attemptedAt)
completedMetadata.Attachments = thought.Metadata.Attachments
if _, updateErr := r.store.UpdateThoughtMetadata(ctx, thought.ID, completedMetadata); updateErr != nil {
return false, updateErr

View File

@@ -116,7 +116,7 @@ func (t *ReparseMetadataTool) Handle(ctx context.Context, req *mcp.CallToolReque
mu.Unlock()
t.logger.Warn("metadata reparse extract failed, using normalized existing metadata", slog.String("thought_id", thought.ID.String()), slog.String("error", extractErr.Error()))
} else {
normalizedTarget = metadata.MarkMetadataComplete(extracted, t.capture, attemptedAt)
normalizedTarget = metadata.MarkMetadataComplete(metadata.SanitizeExtracted(extracted), t.capture, attemptedAt)
normalizedTarget.Attachments = thought.Metadata.Attachments
mu.Lock()
out.Reparsed++

View File

@@ -67,7 +67,7 @@ func (t *UpdateTool) Handle(ctx context.Context, _ *mcp.CallToolRequest, in Upda
t.log.Warn("metadata extraction failed during update, keeping current metadata", slog.String("error", extractErr.Error()))
mergedMetadata = metadata.MarkMetadataFailed(mergedMetadata, t.capture, time.Now().UTC(), extractErr)
} else {
mergedMetadata = metadata.MarkMetadataComplete(extracted, t.capture, time.Now().UTC())
mergedMetadata = metadata.MarkMetadataComplete(metadata.SanitizeExtracted(extracted), t.capture, time.Now().UTC())
mergedMetadata.Attachments = current.Metadata.Attachments
}
}