From fbe87e5ebd4fca98700ab8d44d8a4a126daad632 Mon Sep 17 00:00:00 2001
From: Ryota Yoda
Date: Tue, 28 Apr 2026 09:53:14 +0900
Subject: [PATCH] fix: use Unicode-aware keyword extraction in InMemoryMemoryService

Replace [A-Za-z]+ with \w+ so token extraction includes Unicode word
characters. Add a non-ASCII containment fallback in search_memory() for
scripts without whitespace word boundaries (Japanese, Chinese).

Fixes #5501
---
 .../adk/memory/in_memory_memory_service.py    | 16 ++++--
 .../memory/test_in_memory_memory_service.py   | 50 +++++++++++++++++++
 2 files changed, 61 insertions(+), 5 deletions(-)

diff --git a/src/google/adk/memory/in_memory_memory_service.py b/src/google/adk/memory/in_memory_memory_service.py
index 02276598cb..af4b4c26d9 100644
--- a/src/google/adk/memory/in_memory_memory_service.py
+++ b/src/google/adk/memory/in_memory_memory_service.py
@@ -38,8 +38,8 @@ def _user_key(app_name: str, user_id: str) -> str:
 
 
 def _extract_words_lower(text: str) -> set[str]:
-  """Extracts words from a string and converts them to lowercase."""
-  return set([word.lower() for word in re.findall(r'[A-Za-z]+', text)])
+  """Extracts Unicode-aware tokens from a string in lowercase."""
+  return set(word.lower() for word in re.findall(r'\w+', text))
 
 
 class InMemoryMemoryService(BaseMemoryService):
@@ -116,13 +116,19 @@ async def search_memory(
       for event in session_events:
         if not event.content or not event.content.parts:
           continue
-        words_in_event = _extract_words_lower(
-            ' '.join([part.text for part in event.content.parts if part.text])
+        event_text = ' '.join(
+            [part.text for part in event.content.parts if part.text]
         )
+        words_in_event = _extract_words_lower(event_text)
         if not words_in_event:
           continue
 
-        if any(query_word in words_in_event for query_word in words_in_query):
+        event_text_lower = event_text.lower()
+        if any(
+            query_word in words_in_event
+            or (not query_word.isascii() and query_word in event_text_lower)
+            for query_word in words_in_query
+        ):
           response.memories.append(
               MemoryEntry(
                   content=event.content,
diff --git a/tests/unittests/memory/test_in_memory_memory_service.py b/tests/unittests/memory/test_in_memory_memory_service.py
index d50692f0bc..6c590cddec 100644
--- a/tests/unittests/memory/test_in_memory_memory_service.py
+++ b/tests/unittests/memory/test_in_memory_memory_service.py
@@ -327,3 +327,53 @@ async def test_search_memory_is_scoped_by_user():
   assert (
       result_other_user.memories[0].content.parts[0].text == 'This is a secret.'
   )
+
+
+# --- Non-Latin language tests ---
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    'event_text,query,expected_count',
+    [
+        # Japanese (no space delimiters — substring fallback)
+        ('私の名前は太郎です', '太郎', 1),
+        ('私の名前は太郎です', '天気', 0),
+        # Chinese (no space delimiters — substring fallback)
+        ('我喜欢机器学习', '机器学习', 1),
+        ('我喜欢机器学习', '天气预报', 0),
+        # Korean (space-delimited — token match)
+        ('제 이름은 민수입니다', '민수입니다', 1),
+        # Cyrillic (space-delimited — token match)
+        ('Меня зовут Алексей', 'Алексей', 1),
+        # Mixed: non-Latin substring + Latin token in same event
+        ('太郎 works at ABC Corp', '太郎', 1),
+        ('太郎 works at ABC Corp', 'ABC', 1),
+        # Latin partial-word must NOT match (regression guard)
+        ('I like to code in Python.', 'thon', 0),
+    ],
+)
+async def test_search_memory_non_latin(event_text, query, expected_count):
+  """Tests search_memory with non-Latin scripts and mixed content."""
+  session = Session(
+      app_name=MOCK_APP_NAME,
+      user_id=MOCK_USER_ID,
+      id='session-i18n',
+      last_update_time=7000,
+      events=[
+          Event(
+              id='event-i18n',
+              invocation_id='inv-i18n',
+              author='user',
+              timestamp=90000,
+              content=types.Content(parts=[types.Part(text=event_text)]),
+          ),
+      ],
+  )
+  memory_service = InMemoryMemoryService()
+  await memory_service.add_session_to_memory(session)
+
+  result = await memory_service.search_memory(
+      app_name=MOCK_APP_NAME, user_id=MOCK_USER_ID, query=query
+  )
+  assert len(result.memories) == expected_count