From e6f00ce6362248ac393388d55fccfa8ab505bc4b Mon Sep 17 00:00:00 2001 From: "Hein (Warky)" Date: Mon, 30 Mar 2026 23:14:08 +0200 Subject: [PATCH] feat(metadata): enhance metadata handling by sanitizing extracted data and updating documentation for file storage --- README.md | 2 +- internal/ai/compat/client.go | 4 ++++ internal/metadata/normalize.go | 9 ++++++++ internal/metadata/normalize_test.go | 32 +++++++++++++++++++++++++++++ internal/tools/capture.go | 2 +- internal/tools/files.go | 29 ++++++++++++++++++++++---- internal/tools/files_test.go | 28 +++++++++++++++++++++++++ internal/tools/metadata_retry.go | 10 ++------- internal/tools/reparse_metadata.go | 2 +- internal/tools/update.go | 2 +- llm/memory.md | 5 ++++- 11 files changed, 108 insertions(+), 17 deletions(-) create mode 100644 internal/tools/files_test.go diff --git a/README.md b/README.md index cbb576a..8d49dc6 100644 --- a/README.md +++ b/README.md @@ -136,7 +136,7 @@ Run `reparse_thought_metadata` to fix stale or inconsistent metadata by re-extra ## File Storage -Use `save_file` to persist binary files as base64. Files can optionally be linked to a memory by passing `thought_id`, which also adds an attachment reference to that thought's metadata. +Use `save_file` to persist binary files as base64. Files can optionally be linked to a memory by passing `thought_id`, which also adds an attachment reference to that thought's metadata. AI clients should prefer `save_file` when the goal is to retain the artifact itself, rather than reading or summarizing the file first. Stored files and attachment metadata are not forwarded to the metadata extraction client. ```json { diff --git a/internal/ai/compat/client.go b/internal/ai/compat/client.go index 028a9bf..1acdd6b 100644 --- a/internal/ai/compat/client.go +++ b/internal/ai/compat/client.go @@ -252,6 +252,10 @@ func (c *Client) ExtractMetadata(ctx context.Context, input string) (thoughttype err = fallbackErr } + if ctx.Err() != nil { + return thoughttypes.ThoughtMetadata{}, fmt.Errorf("%s metadata: %w", c.name, ctx.Err()) + } + heuristic := heuristicMetadataFromInput(input) if c.log != nil { c.log.Warn("metadata extraction failed for all models, using heuristic fallback", diff --git a/internal/metadata/normalize.go b/internal/metadata/normalize.go index 7237c2e..51cd6dd 100644 --- a/internal/metadata/normalize.go +++ b/internal/metadata/normalize.go @@ -110,6 +110,15 @@ func MarkMetadataComplete(base thoughttypes.ThoughtMetadata, capture config.Capt return out } +func SanitizeExtracted(in thoughttypes.ThoughtMetadata) thoughttypes.ThoughtMetadata { + in.Attachments = nil + in.MetadataStatus = "" + in.MetadataUpdatedAt = "" + in.MetadataLastAttemptedAt = "" + in.MetadataError = "" + return in +} + func normalizeList(values []string, limit int) []string { seen := make(map[string]struct{}, len(values)) result := make([]string, 0, len(values)) diff --git a/internal/metadata/normalize_test.go b/internal/metadata/normalize_test.go index aa75b54..f191bc7 100644 --- a/internal/metadata/normalize_test.go +++ b/internal/metadata/normalize_test.go @@ -107,6 +107,38 @@ func TestNormalizeDedupesAttachmentsByFileID(t *testing.T) { } } +func TestSanitizeExtractedDropsAttachmentsAndMetadataControlFields(t *testing.T) { + id := uuid.New() + + got := SanitizeExtracted(thoughttypes.ThoughtMetadata{ + Type: "idea", + Attachments: []thoughttypes.ThoughtAttachment{{FileID: id, Name: "secret.pdf"}}, + MetadataStatus: MetadataStatusFailed, + MetadataUpdatedAt: "2026-03-30T10:00:00Z", + MetadataLastAttemptedAt: "2026-03-30T10:01:00Z", + MetadataError: "boom", + }) + + if len(got.Attachments) != 0 { + t.Fatalf("Attachments len = %d, want 0", len(got.Attachments)) + } + if got.MetadataStatus != "" { + t.Fatalf("MetadataStatus = %q, want empty", got.MetadataStatus) + } + if got.MetadataUpdatedAt != "" { + t.Fatalf("MetadataUpdatedAt = %q, want empty", got.MetadataUpdatedAt) + } + if got.MetadataLastAttemptedAt != "" { + t.Fatalf("MetadataLastAttemptedAt = %q, want empty", got.MetadataLastAttemptedAt) + } + if got.MetadataError != "" { + t.Fatalf("MetadataError = %q, want empty", got.MetadataError) + } + if got.Type != "idea" { + t.Fatalf("Type = %q, want idea", got.Type) + } +} + func TestMarkMetadataPendingTracksAttemptWithoutClearingPreviousSuccess(t *testing.T) { attempt := time.Date(2026, 3, 30, 10, 0, 0, 0, time.UTC) base := thoughttypes.ThoughtMetadata{ diff --git a/internal/tools/capture.go b/internal/tools/capture.go index d44e588..e334975 100644 --- a/internal/tools/capture.go +++ b/internal/tools/capture.go @@ -90,7 +90,7 @@ func (t *CaptureTool) Handle(ctx context.Context, req *mcp.CallToolRequest, in C thought := thoughttypes.Thought{ Content: content, Embedding: embedding, - Metadata: metadata.Normalize(rawMetadata, t.capture), + Metadata: metadata.Normalize(metadata.SanitizeExtracted(rawMetadata), t.capture), } if project != nil { thought.ProjectID = &project.ID diff --git a/internal/tools/files.go b/internal/tools/files.go index 76f77b1..35045c5 100644 --- a/internal/tools/files.go +++ b/internal/tools/files.go @@ -243,11 +243,32 @@ func splitDataURL(value string) (contentBase64 string, mediaType string) { } func decodeBase64(value string) ([]byte, error) { - decoded, err := base64.StdEncoding.DecodeString(value) - if err == nil { - return decoded, nil + cleaned := strings.Map(func(r rune) rune { + switch r { + case ' ', '\t', '\n', '\r': + return -1 + default: + return r + } + }, value) + + encodings := []*base64.Encoding{ + base64.StdEncoding, + base64.RawStdEncoding, + base64.URLEncoding, + base64.RawURLEncoding, } - return base64.RawStdEncoding.DecodeString(value) + + var lastErr error + for _, encoding := range encodings { + decoded, err := encoding.DecodeString(cleaned) + if err == nil { + return decoded, nil + } + lastErr = err + } + + return nil, lastErr } func normalizeMediaType(explicit string, fromDataURL string, content []byte) string { diff --git a/internal/tools/files_test.go b/internal/tools/files_test.go new file mode 100644 index 0000000..66aa06c --- /dev/null +++ b/internal/tools/files_test.go @@ -0,0 +1,28 @@ +package tools + +import "testing" + +func TestDecodeBase64AcceptsWhitespaceAndMultipleVariants(t *testing.T) { + tests := []struct { + name string + input string + want string + }{ + {name: "standard with whitespace", input: "aG V s\nbG8=", want: "hello"}, + {name: "raw standard", input: "aGVsbG8", want: "hello"}, + {name: "standard url-safe payload", input: "--8=", want: string([]byte{0xfb, 0xef})}, + {name: "raw url-safe payload", input: "--8", want: string([]byte{0xfb, 0xef})}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got, err := decodeBase64(tc.input) + if err != nil { + t.Fatalf("decodeBase64(%q) error = %v", tc.input, err) + } + if string(got) != tc.want { + t.Fatalf("decodeBase64(%q) = %q, want %q", tc.input, string(got), tc.want) + } + }) + } +} diff --git a/internal/tools/metadata_retry.go b/internal/tools/metadata_retry.go index 39f79ee..c49e356 100644 --- a/internal/tools/metadata_retry.go +++ b/internal/tools/metadata_retry.go @@ -82,13 +82,7 @@ func (t *RetryMetadataTool) Handle(ctx context.Context, req *mcp.CallToolRequest func (r *MetadataRetryer) QueueThought(id uuid.UUID) { go func() { - attemptCtx := r.backgroundCtx - if r.metadataTimeout > 0 { - var cancel context.CancelFunc - attemptCtx, cancel = context.WithTimeout(r.backgroundCtx, r.metadataTimeout) - defer cancel() - } - if _, err := r.retryOne(attemptCtx, id); err != nil { + if _, err := r.retryOne(r.backgroundCtx, id); err != nil { r.logger.Warn("background metadata retry failed", slog.String("thought_id", id.String()), slog.String("error", err.Error())) } }() @@ -196,7 +190,7 @@ func (r *MetadataRetryer) retryOne(ctx context.Context, id uuid.UUID) (bool, err return false, extractErr } - completedMetadata := metadata.MarkMetadataComplete(extracted, r.capture, attemptedAt) + completedMetadata := metadata.MarkMetadataComplete(metadata.SanitizeExtracted(extracted), r.capture, attemptedAt) completedMetadata.Attachments = thought.Metadata.Attachments if _, updateErr := r.store.UpdateThoughtMetadata(ctx, thought.ID, completedMetadata); updateErr != nil { return false, updateErr diff --git a/internal/tools/reparse_metadata.go b/internal/tools/reparse_metadata.go index e64ccc0..a7d5bb2 100644 --- a/internal/tools/reparse_metadata.go +++ b/internal/tools/reparse_metadata.go @@ -116,7 +116,7 @@ func (t *ReparseMetadataTool) Handle(ctx context.Context, req *mcp.CallToolReque mu.Unlock() t.logger.Warn("metadata reparse extract failed, using normalized existing metadata", slog.String("thought_id", thought.ID.String()), slog.String("error", extractErr.Error())) } else { - normalizedTarget = metadata.MarkMetadataComplete(extracted, t.capture, attemptedAt) + normalizedTarget = metadata.MarkMetadataComplete(metadata.SanitizeExtracted(extracted), t.capture, attemptedAt) normalizedTarget.Attachments = thought.Metadata.Attachments mu.Lock() out.Reparsed++ diff --git a/internal/tools/update.go b/internal/tools/update.go index 3e6c4aa..0f8a3b5 100644 --- a/internal/tools/update.go +++ b/internal/tools/update.go @@ -67,7 +67,7 @@ func (t *UpdateTool) Handle(ctx context.Context, _ *mcp.CallToolRequest, in Upda t.log.Warn("metadata extraction failed during update, keeping current metadata", slog.String("error", extractErr.Error())) mergedMetadata = metadata.MarkMetadataFailed(mergedMetadata, t.capture, time.Now().UTC(), extractErr) } else { - mergedMetadata = metadata.MarkMetadataComplete(extracted, t.capture, time.Now().UTC()) + mergedMetadata = metadata.MarkMetadataComplete(metadata.SanitizeExtracted(extracted), t.capture, time.Now().UTC()) mergedMetadata.Attachments = current.Metadata.Attachments } } diff --git a/llm/memory.md b/llm/memory.md index 2bcbc3a..957fbb0 100644 --- a/llm/memory.md +++ b/llm/memory.md @@ -24,9 +24,11 @@ Use AMCS as memory with two scopes: - Before substantial work, always retrieve context with `get_project_context` or `recall_context` so prior decisions inform your approach. - Save durable project facts with `capture_thought` after completing meaningful work. - Use `save_file` for project assets the memory should retain, such as screenshots, PDFs, audio notes, and other documents. +- If the goal is to retain the artifact itself, use `save_file` directly instead of first reading, transcribing, or summarizing the file contents. - Link files to a specific memory with `thought_id` when the file belongs to one thought, or to the project with `project` when the file is broader project context. - Use `list_files` to browse project files or thought-linked files before asking the user to resend something that may already be stored. - Use `load_file` when you need the actual stored file contents back. +- Stored files and attachment metadata must not be sent to the metadata extraction client. - Do not attach memory to the wrong project. ## Global Notebook Rules @@ -41,8 +43,9 @@ Use AMCS as memory with two scopes: - Do not save secrets, raw logs, or transient noise. - Prefer concise summaries. - Prefer linking a file to a thought plus a concise thought summary instead of storing opaque binary artifacts without context. +- Do not read a file just to make it storable; store the file directly and read it only when the file contents are needed for reasoning. - When saving, choose the narrowest correct scope: project if project-specific, global if not. ## Short Operational Form -Use AMCS memory in project scope when the current work matches a known project. If no clear project matches, global notebook memory is allowed for non-project-specific information. Store durable notes with `capture_thought`, store supporting binary artifacts with `save_file`, browse them with `list_files`, and load them with `load_file`. Never store project-specific memory globally when a matching project exists, and never store memory in the wrong project. If project matching is ambiguous, ask the user. +Use AMCS memory in project scope when the current work matches a known project. If no clear project matches, global notebook memory is allowed for non-project-specific information. Store durable notes with `capture_thought`, store supporting binary artifacts with `save_file`, prefer saving a file directly when the artifact itself is what matters, browse stored files with `list_files`, and load them with `load_file` only when their contents are needed. Never store project-specific memory globally when a matching project exists, and never store memory in the wrong project. If project matching is ambiguous, ask the user.