feat(metadata): enhance metadata handling by sanitizing extracted data and updating documentation for file storage
This commit is contained in:
@@ -90,7 +90,7 @@ func (t *CaptureTool) Handle(ctx context.Context, req *mcp.CallToolRequest, in C
|
||||
thought := thoughttypes.Thought{
|
||||
Content: content,
|
||||
Embedding: embedding,
|
||||
Metadata: metadata.Normalize(rawMetadata, t.capture),
|
||||
Metadata: metadata.Normalize(metadata.SanitizeExtracted(rawMetadata), t.capture),
|
||||
}
|
||||
if project != nil {
|
||||
thought.ProjectID = &project.ID
|
||||
|
||||
@@ -243,11 +243,32 @@ func splitDataURL(value string) (contentBase64 string, mediaType string) {
|
||||
}
|
||||
|
||||
// decodeBase64 decodes value as base64, tolerating embedded whitespace and
// accepting the standard and URL-safe alphabets, each with or without padding.
//
// Spaces, tabs, line feeds and carriage returns are removed before decoding.
// The cleaned input is then tried against StdEncoding, RawStdEncoding,
// URLEncoding and RawURLEncoding in that order, and the first successful
// decode is returned. If every variant fails, the error from the last variant
// tried (RawURLEncoding) is returned together with a nil slice.
func decodeBase64(value string) ([]byte, error) {
	// Drop whitespace up front so wrapped or pretty-printed payloads decode
	// the same way under every variant; strings.Map removes a rune when the
	// mapping function returns a negative value.
	cleaned := strings.Map(func(r rune) rune {
		switch r {
		case ' ', '\t', '\n', '\r':
			return -1
		default:
			return r
		}
	}, value)

	encodings := []*base64.Encoding{
		base64.StdEncoding,
		base64.RawStdEncoding,
		base64.URLEncoding,
		base64.RawURLEncoding,
	}

	var lastErr error
	for _, encoding := range encodings {
		decoded, err := encoding.DecodeString(cleaned)
		if err == nil {
			return decoded, nil
		}
		lastErr = err
	}

	return nil, lastErr
}
|
||||
|
||||
func normalizeMediaType(explicit string, fromDataURL string, content []byte) string {
|
||||
|
||||
28
internal/tools/files_test.go
Normal file
28
internal/tools/files_test.go
Normal file
@@ -0,0 +1,28 @@
|
||||
package tools
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestDecodeBase64AcceptsWhitespaceAndMultipleVariants(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
want string
|
||||
}{
|
||||
{name: "standard with whitespace", input: "aG V s\nbG8=", want: "hello"},
|
||||
{name: "raw standard", input: "aGVsbG8", want: "hello"},
|
||||
{name: "standard url-safe payload", input: "--8=", want: string([]byte{0xfb, 0xef})},
|
||||
{name: "raw url-safe payload", input: "--8", want: string([]byte{0xfb, 0xef})},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got, err := decodeBase64(tc.input)
|
||||
if err != nil {
|
||||
t.Fatalf("decodeBase64(%q) error = %v", tc.input, err)
|
||||
}
|
||||
if string(got) != tc.want {
|
||||
t.Fatalf("decodeBase64(%q) = %q, want %q", tc.input, string(got), tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -82,13 +82,7 @@ func (t *RetryMetadataTool) Handle(ctx context.Context, req *mcp.CallToolRequest
|
||||
|
||||
func (r *MetadataRetryer) QueueThought(id uuid.UUID) {
|
||||
go func() {
|
||||
attemptCtx := r.backgroundCtx
|
||||
if r.metadataTimeout > 0 {
|
||||
var cancel context.CancelFunc
|
||||
attemptCtx, cancel = context.WithTimeout(r.backgroundCtx, r.metadataTimeout)
|
||||
defer cancel()
|
||||
}
|
||||
if _, err := r.retryOne(attemptCtx, id); err != nil {
|
||||
if _, err := r.retryOne(r.backgroundCtx, id); err != nil {
|
||||
r.logger.Warn("background metadata retry failed", slog.String("thought_id", id.String()), slog.String("error", err.Error()))
|
||||
}
|
||||
}()
|
||||
@@ -196,7 +190,7 @@ func (r *MetadataRetryer) retryOne(ctx context.Context, id uuid.UUID) (bool, err
|
||||
return false, extractErr
|
||||
}
|
||||
|
||||
completedMetadata := metadata.MarkMetadataComplete(extracted, r.capture, attemptedAt)
|
||||
completedMetadata := metadata.MarkMetadataComplete(metadata.SanitizeExtracted(extracted), r.capture, attemptedAt)
|
||||
completedMetadata.Attachments = thought.Metadata.Attachments
|
||||
if _, updateErr := r.store.UpdateThoughtMetadata(ctx, thought.ID, completedMetadata); updateErr != nil {
|
||||
return false, updateErr
|
||||
|
||||
@@ -116,7 +116,7 @@ func (t *ReparseMetadataTool) Handle(ctx context.Context, req *mcp.CallToolReque
|
||||
mu.Unlock()
|
||||
t.logger.Warn("metadata reparse extract failed, using normalized existing metadata", slog.String("thought_id", thought.ID.String()), slog.String("error", extractErr.Error()))
|
||||
} else {
|
||||
normalizedTarget = metadata.MarkMetadataComplete(extracted, t.capture, attemptedAt)
|
||||
normalizedTarget = metadata.MarkMetadataComplete(metadata.SanitizeExtracted(extracted), t.capture, attemptedAt)
|
||||
normalizedTarget.Attachments = thought.Metadata.Attachments
|
||||
mu.Lock()
|
||||
out.Reparsed++
|
||||
|
||||
@@ -67,7 +67,7 @@ func (t *UpdateTool) Handle(ctx context.Context, _ *mcp.CallToolRequest, in Upda
|
||||
t.log.Warn("metadata extraction failed during update, keeping current metadata", slog.String("error", extractErr.Error()))
|
||||
mergedMetadata = metadata.MarkMetadataFailed(mergedMetadata, t.capture, time.Now().UTC(), extractErr)
|
||||
} else {
|
||||
mergedMetadata = metadata.MarkMetadataComplete(extracted, t.capture, time.Now().UTC())
|
||||
mergedMetadata = metadata.MarkMetadataComplete(metadata.SanitizeExtracted(extracted), t.capture, time.Now().UTC())
|
||||
mergedMetadata.Attachments = current.Metadata.Attachments
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user