feat(metadata): enhance metadata handling by sanitizing extracted data and updating documentation for file storage
This commit is contained in:
@@ -136,7 +136,7 @@ Run `reparse_thought_metadata` to fix stale or inconsistent metadata by re-extra
|
|||||||
|
|
||||||
## File Storage
|
## File Storage
|
||||||
|
|
||||||
Use `save_file` to persist binary files as base64. Files can optionally be linked to a memory by passing `thought_id`, which also adds an attachment reference to that thought's metadata.
|
Use `save_file` to persist binary files as base64. Files can optionally be linked to a memory by passing `thought_id`, which also adds an attachment reference to that thought's metadata. AI clients should prefer `save_file` when the goal is to retain the artifact itself, rather than reading or summarizing the file first. Stored files and attachment metadata are not forwarded to the metadata extraction client.
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -252,6 +252,10 @@ func (c *Client) ExtractMetadata(ctx context.Context, input string) (thoughttype
|
|||||||
err = fallbackErr
|
err = fallbackErr
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
return thoughttypes.ThoughtMetadata{}, fmt.Errorf("%s metadata: %w", c.name, ctx.Err())
|
||||||
|
}
|
||||||
|
|
||||||
heuristic := heuristicMetadataFromInput(input)
|
heuristic := heuristicMetadataFromInput(input)
|
||||||
if c.log != nil {
|
if c.log != nil {
|
||||||
c.log.Warn("metadata extraction failed for all models, using heuristic fallback",
|
c.log.Warn("metadata extraction failed for all models, using heuristic fallback",
|
||||||
|
|||||||
@@ -110,6 +110,15 @@ func MarkMetadataComplete(base thoughttypes.ThoughtMetadata, capture config.Capt
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func SanitizeExtracted(in thoughttypes.ThoughtMetadata) thoughttypes.ThoughtMetadata {
|
||||||
|
in.Attachments = nil
|
||||||
|
in.MetadataStatus = ""
|
||||||
|
in.MetadataUpdatedAt = ""
|
||||||
|
in.MetadataLastAttemptedAt = ""
|
||||||
|
in.MetadataError = ""
|
||||||
|
return in
|
||||||
|
}
|
||||||
|
|
||||||
func normalizeList(values []string, limit int) []string {
|
func normalizeList(values []string, limit int) []string {
|
||||||
seen := make(map[string]struct{}, len(values))
|
seen := make(map[string]struct{}, len(values))
|
||||||
result := make([]string, 0, len(values))
|
result := make([]string, 0, len(values))
|
||||||
|
|||||||
@@ -107,6 +107,38 @@ func TestNormalizeDedupesAttachmentsByFileID(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestSanitizeExtractedDropsAttachmentsAndMetadataControlFields(t *testing.T) {
|
||||||
|
id := uuid.New()
|
||||||
|
|
||||||
|
got := SanitizeExtracted(thoughttypes.ThoughtMetadata{
|
||||||
|
Type: "idea",
|
||||||
|
Attachments: []thoughttypes.ThoughtAttachment{{FileID: id, Name: "secret.pdf"}},
|
||||||
|
MetadataStatus: MetadataStatusFailed,
|
||||||
|
MetadataUpdatedAt: "2026-03-30T10:00:00Z",
|
||||||
|
MetadataLastAttemptedAt: "2026-03-30T10:01:00Z",
|
||||||
|
MetadataError: "boom",
|
||||||
|
})
|
||||||
|
|
||||||
|
if len(got.Attachments) != 0 {
|
||||||
|
t.Fatalf("Attachments len = %d, want 0", len(got.Attachments))
|
||||||
|
}
|
||||||
|
if got.MetadataStatus != "" {
|
||||||
|
t.Fatalf("MetadataStatus = %q, want empty", got.MetadataStatus)
|
||||||
|
}
|
||||||
|
if got.MetadataUpdatedAt != "" {
|
||||||
|
t.Fatalf("MetadataUpdatedAt = %q, want empty", got.MetadataUpdatedAt)
|
||||||
|
}
|
||||||
|
if got.MetadataLastAttemptedAt != "" {
|
||||||
|
t.Fatalf("MetadataLastAttemptedAt = %q, want empty", got.MetadataLastAttemptedAt)
|
||||||
|
}
|
||||||
|
if got.MetadataError != "" {
|
||||||
|
t.Fatalf("MetadataError = %q, want empty", got.MetadataError)
|
||||||
|
}
|
||||||
|
if got.Type != "idea" {
|
||||||
|
t.Fatalf("Type = %q, want idea", got.Type)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestMarkMetadataPendingTracksAttemptWithoutClearingPreviousSuccess(t *testing.T) {
|
func TestMarkMetadataPendingTracksAttemptWithoutClearingPreviousSuccess(t *testing.T) {
|
||||||
attempt := time.Date(2026, 3, 30, 10, 0, 0, 0, time.UTC)
|
attempt := time.Date(2026, 3, 30, 10, 0, 0, 0, time.UTC)
|
||||||
base := thoughttypes.ThoughtMetadata{
|
base := thoughttypes.ThoughtMetadata{
|
||||||
|
|||||||
@@ -90,7 +90,7 @@ func (t *CaptureTool) Handle(ctx context.Context, req *mcp.CallToolRequest, in C
|
|||||||
thought := thoughttypes.Thought{
|
thought := thoughttypes.Thought{
|
||||||
Content: content,
|
Content: content,
|
||||||
Embedding: embedding,
|
Embedding: embedding,
|
||||||
Metadata: metadata.Normalize(rawMetadata, t.capture),
|
Metadata: metadata.Normalize(metadata.SanitizeExtracted(rawMetadata), t.capture),
|
||||||
}
|
}
|
||||||
if project != nil {
|
if project != nil {
|
||||||
thought.ProjectID = &project.ID
|
thought.ProjectID = &project.ID
|
||||||
|
|||||||
@@ -243,11 +243,32 @@ func splitDataURL(value string) (contentBase64 string, mediaType string) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func decodeBase64(value string) ([]byte, error) {
|
func decodeBase64(value string) ([]byte, error) {
|
||||||
decoded, err := base64.StdEncoding.DecodeString(value)
|
cleaned := strings.Map(func(r rune) rune {
|
||||||
|
switch r {
|
||||||
|
case ' ', '\t', '\n', '\r':
|
||||||
|
return -1
|
||||||
|
default:
|
||||||
|
return r
|
||||||
|
}
|
||||||
|
}, value)
|
||||||
|
|
||||||
|
encodings := []*base64.Encoding{
|
||||||
|
base64.StdEncoding,
|
||||||
|
base64.RawStdEncoding,
|
||||||
|
base64.URLEncoding,
|
||||||
|
base64.RawURLEncoding,
|
||||||
|
}
|
||||||
|
|
||||||
|
var lastErr error
|
||||||
|
for _, encoding := range encodings {
|
||||||
|
decoded, err := encoding.DecodeString(cleaned)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
return decoded, nil
|
return decoded, nil
|
||||||
}
|
}
|
||||||
return base64.RawStdEncoding.DecodeString(value)
|
lastErr = err
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil, lastErr
|
||||||
}
|
}
|
||||||
|
|
||||||
func normalizeMediaType(explicit string, fromDataURL string, content []byte) string {
|
func normalizeMediaType(explicit string, fromDataURL string, content []byte) string {
|
||||||
|
|||||||
28
internal/tools/files_test.go
Normal file
28
internal/tools/files_test.go
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
package tools
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestDecodeBase64AcceptsWhitespaceAndMultipleVariants(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
input string
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{name: "standard with whitespace", input: "aG V s\nbG8=", want: "hello"},
|
||||||
|
{name: "raw standard", input: "aGVsbG8", want: "hello"},
|
||||||
|
{name: "standard url-safe payload", input: "--8=", want: string([]byte{0xfb, 0xef})},
|
||||||
|
{name: "raw url-safe payload", input: "--8", want: string([]byte{0xfb, 0xef})},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range tests {
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
got, err := decodeBase64(tc.input)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("decodeBase64(%q) error = %v", tc.input, err)
|
||||||
|
}
|
||||||
|
if string(got) != tc.want {
|
||||||
|
t.Fatalf("decodeBase64(%q) = %q, want %q", tc.input, string(got), tc.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -82,13 +82,7 @@ func (t *RetryMetadataTool) Handle(ctx context.Context, req *mcp.CallToolRequest
|
|||||||
|
|
||||||
func (r *MetadataRetryer) QueueThought(id uuid.UUID) {
|
func (r *MetadataRetryer) QueueThought(id uuid.UUID) {
|
||||||
go func() {
|
go func() {
|
||||||
attemptCtx := r.backgroundCtx
|
if _, err := r.retryOne(r.backgroundCtx, id); err != nil {
|
||||||
if r.metadataTimeout > 0 {
|
|
||||||
var cancel context.CancelFunc
|
|
||||||
attemptCtx, cancel = context.WithTimeout(r.backgroundCtx, r.metadataTimeout)
|
|
||||||
defer cancel()
|
|
||||||
}
|
|
||||||
if _, err := r.retryOne(attemptCtx, id); err != nil {
|
|
||||||
r.logger.Warn("background metadata retry failed", slog.String("thought_id", id.String()), slog.String("error", err.Error()))
|
r.logger.Warn("background metadata retry failed", slog.String("thought_id", id.String()), slog.String("error", err.Error()))
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
@@ -196,7 +190,7 @@ func (r *MetadataRetryer) retryOne(ctx context.Context, id uuid.UUID) (bool, err
|
|||||||
return false, extractErr
|
return false, extractErr
|
||||||
}
|
}
|
||||||
|
|
||||||
completedMetadata := metadata.MarkMetadataComplete(extracted, r.capture, attemptedAt)
|
completedMetadata := metadata.MarkMetadataComplete(metadata.SanitizeExtracted(extracted), r.capture, attemptedAt)
|
||||||
completedMetadata.Attachments = thought.Metadata.Attachments
|
completedMetadata.Attachments = thought.Metadata.Attachments
|
||||||
if _, updateErr := r.store.UpdateThoughtMetadata(ctx, thought.ID, completedMetadata); updateErr != nil {
|
if _, updateErr := r.store.UpdateThoughtMetadata(ctx, thought.ID, completedMetadata); updateErr != nil {
|
||||||
return false, updateErr
|
return false, updateErr
|
||||||
|
|||||||
@@ -116,7 +116,7 @@ func (t *ReparseMetadataTool) Handle(ctx context.Context, req *mcp.CallToolReque
|
|||||||
mu.Unlock()
|
mu.Unlock()
|
||||||
t.logger.Warn("metadata reparse extract failed, using normalized existing metadata", slog.String("thought_id", thought.ID.String()), slog.String("error", extractErr.Error()))
|
t.logger.Warn("metadata reparse extract failed, using normalized existing metadata", slog.String("thought_id", thought.ID.String()), slog.String("error", extractErr.Error()))
|
||||||
} else {
|
} else {
|
||||||
normalizedTarget = metadata.MarkMetadataComplete(extracted, t.capture, attemptedAt)
|
normalizedTarget = metadata.MarkMetadataComplete(metadata.SanitizeExtracted(extracted), t.capture, attemptedAt)
|
||||||
normalizedTarget.Attachments = thought.Metadata.Attachments
|
normalizedTarget.Attachments = thought.Metadata.Attachments
|
||||||
mu.Lock()
|
mu.Lock()
|
||||||
out.Reparsed++
|
out.Reparsed++
|
||||||
|
|||||||
@@ -67,7 +67,7 @@ func (t *UpdateTool) Handle(ctx context.Context, _ *mcp.CallToolRequest, in Upda
|
|||||||
t.log.Warn("metadata extraction failed during update, keeping current metadata", slog.String("error", extractErr.Error()))
|
t.log.Warn("metadata extraction failed during update, keeping current metadata", slog.String("error", extractErr.Error()))
|
||||||
mergedMetadata = metadata.MarkMetadataFailed(mergedMetadata, t.capture, time.Now().UTC(), extractErr)
|
mergedMetadata = metadata.MarkMetadataFailed(mergedMetadata, t.capture, time.Now().UTC(), extractErr)
|
||||||
} else {
|
} else {
|
||||||
mergedMetadata = metadata.MarkMetadataComplete(extracted, t.capture, time.Now().UTC())
|
mergedMetadata = metadata.MarkMetadataComplete(metadata.SanitizeExtracted(extracted), t.capture, time.Now().UTC())
|
||||||
mergedMetadata.Attachments = current.Metadata.Attachments
|
mergedMetadata.Attachments = current.Metadata.Attachments
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -24,9 +24,11 @@ Use AMCS as memory with two scopes:
|
|||||||
- Before substantial work, always retrieve context with `get_project_context` or `recall_context` so prior decisions inform your approach.
|
- Before substantial work, always retrieve context with `get_project_context` or `recall_context` so prior decisions inform your approach.
|
||||||
- Save durable project facts with `capture_thought` after completing meaningful work.
|
- Save durable project facts with `capture_thought` after completing meaningful work.
|
||||||
- Use `save_file` for project assets the memory should retain, such as screenshots, PDFs, audio notes, and other documents.
|
- Use `save_file` for project assets the memory should retain, such as screenshots, PDFs, audio notes, and other documents.
|
||||||
|
- If the goal is to retain the artifact itself, use `save_file` directly instead of first reading, transcribing, or summarizing the file contents.
|
||||||
- Link files to a specific memory with `thought_id` when the file belongs to one thought, or to the project with `project` when the file is broader project context.
|
- Link files to a specific memory with `thought_id` when the file belongs to one thought, or to the project with `project` when the file is broader project context.
|
||||||
- Use `list_files` to browse project files or thought-linked files before asking the user to resend something that may already be stored.
|
- Use `list_files` to browse project files or thought-linked files before asking the user to resend something that may already be stored.
|
||||||
- Use `load_file` when you need the actual stored file contents back.
|
- Use `load_file` when you need the actual stored file contents back.
|
||||||
|
- Stored files and attachment metadata must not be sent to the metadata extraction client.
|
||||||
- Do not attach memory to the wrong project.
|
- Do not attach memory to the wrong project.
|
||||||
|
|
||||||
## Global Notebook Rules
|
## Global Notebook Rules
|
||||||
@@ -41,8 +43,9 @@ Use AMCS as memory with two scopes:
|
|||||||
- Do not save secrets, raw logs, or transient noise.
|
- Do not save secrets, raw logs, or transient noise.
|
||||||
- Prefer concise summaries.
|
- Prefer concise summaries.
|
||||||
- Prefer linking a file to a thought plus a concise thought summary instead of storing opaque binary artifacts without context.
|
- Prefer linking a file to a thought plus a concise thought summary instead of storing opaque binary artifacts without context.
|
||||||
|
- Do not read a file just to make it storable; store the file directly and read it only when the file contents are needed for reasoning.
|
||||||
- When saving, choose the narrowest correct scope: project if project-specific, global if not.
|
- When saving, choose the narrowest correct scope: project if project-specific, global if not.
|
||||||
|
|
||||||
## Short Operational Form
|
## Short Operational Form
|
||||||
|
|
||||||
Use AMCS memory in project scope when the current work matches a known project. If no clear project matches, global notebook memory is allowed for non-project-specific information. Store durable notes with `capture_thought`, store supporting binary artifacts with `save_file`, browse them with `list_files`, and load them with `load_file`. Never store project-specific memory globally when a matching project exists, and never store memory in the wrong project. If project matching is ambiguous, ask the user.
|
Use AMCS memory in project scope when the current work matches a known project. If no clear project matches, global notebook memory is allowed for non-project-specific information. Store durable notes with `capture_thought`, store supporting binary artifacts with `save_file`, prefer saving a file directly when the artifact itself is what matters, browse stored files with `list_files`, and load them with `load_file` only when their contents are needed. Never store project-specific memory globally when a matching project exists, and never store memory in the wrong project. If project matching is ambiguous, ask the user.
|
||||||
|
|||||||
Reference in New Issue
Block a user