From 2c5b95f3df9f36fdae30d93025e852636b378146 Mon Sep 17 00:00:00 2001 From: SchrodingersCattt Date: Tue, 28 Oct 2025 09:40:06 +0000 Subject: [PATCH 1/5] feat: support zero-count elements in type_map for sort_atom_names --- dpdata/utils.py | 76 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 55 insertions(+), 21 deletions(-) diff --git a/dpdata/utils.py b/dpdata/utils.py index 58a908cc7..4c72bec94 100644 --- a/dpdata/utils.py +++ b/dpdata/utils.py @@ -62,9 +62,10 @@ def add_atom_names(data, atom_names): def sort_atom_names(data, type_map=None): - """Sort atom_names of the system and reorder atom_numbs and atom_types accoarding + """Sort atom_names of the system and reorder atom_numbs and atom_types according to atom_names. If type_map is not given, atom_names will be sorted by - alphabetical order. If type_map is given, atom_names will be type_map. + alphabetical order. If type_map is given, atom_names will be set to type_map, + and zero-count elements are kept. Parameters ---------- @@ -74,28 +75,61 @@ def sort_atom_names(data, type_map=None): type_map """ if type_map is not None: - # assign atom_names index to the specify order - # atom_names must be a subset of type_map - assert set(data["atom_names"]).issubset(set(type_map)) - # for the condition that type_map is a proper superset of atom_names - # new_atoms = set(type_map) - set(data["atom_names"]) - new_atoms = [e for e in type_map if e not in data["atom_names"]] - if new_atoms: - data = add_atom_names(data, new_atoms) - # index that will sort an array by type_map - # a[as[a]] == b[as[b]] as == argsort - # as[as[b]] == as^{-1}[b] - # a[as[a][as[as[b]]]] = b[as[b][as^{-1}[b]]] = b[id] - idx = np.argsort(data["atom_names"], kind="stable")[ - np.argsort(np.argsort(type_map, kind="stable"), kind="stable") - ] + # assign atom_names index to the specified order + # only active (numb > 0) atom names must be in type_map + orig_names = data["atom_names"] + orig_numbs = data["atom_numbs"] + active_names = {name for name, numb in zip(orig_names, orig_numbs) if numb > 0} + type_map_set = set(type_map) + if not active_names.issubset(type_map_set): + missing = active_names - type_map_set + raise ValueError(f"Active atom types {missing} not in provided type_map.") + + # for the condition that type_map is a proper superset of atom_names, + # we allow new elements with atom_numb = 0 + new_names = list(type_map) + new_numbs = [] + name_to_old_idx = {name: i for i, name in enumerate(orig_names)} + + for name in new_names: + if name in name_to_old_idx: + new_numbs.append(orig_numbs[name_to_old_idx[name]]) + else: + new_numbs.append(0) + + # build mapping from old atom type index to new one + # old_types[i] = j --> new_types[i] = type_map.index(atom_names[j]) + old_to_new_index = {} + for old_idx, name in enumerate(orig_names): + if name in type_map_set: + new_idx = type_map.index(name) + old_to_new_index[old_idx] = new_idx + + # remap atom_types using the index mapping + old_types = np.array(data["atom_types"]) + new_types = np.empty_like(old_types) + for old_idx, new_idx in old_to_new_index.items(): + new_types[old_types == old_idx] = new_idx + + # update data in-place + data["atom_names"] = new_names + data["atom_numbs"] = new_numbs + data["atom_types"] = new_types + else: # index that will sort an array by alphabetical order + # idx = argsort(atom_names) --> atom_names[idx] is sorted idx = np.argsort(data["atom_names"], kind="stable") - # sort atom_names, atom_numbs, atom_types by idx - data["atom_names"] = list(np.array(data["atom_names"])[idx]) - data["atom_numbs"] = list(np.array(data["atom_numbs"])[idx]) - data["atom_types"] = np.argsort(idx, kind="stable")[data["atom_types"]] + # sort atom_names and atom_numbs by idx + data["atom_names"] = list(np.array(data["atom_names"])[idx]) + data["atom_numbs"] = list(np.array(data["atom_numbs"])[idx]) + # to update atom_types: we need the inverse permutation of idx + # because if old_type = i, and atom_names[i] moves to position j, + # then the new type should be j. + # inv_idx = argsort(idx) satisfies: inv_idx[idx[i]] = i + inv_idx = np.argsort(idx, kind="stable") + data["atom_types"] = inv_idx[data["atom_types"]] + return data From f162c552cb760a2b0a162f25bc8a4cae9cb69dd9 Mon Sep 17 00:00:00 2001 From: SchrodingersCattt Date: Fri, 31 Oct 2025 00:18:53 +0000 Subject: [PATCH 2/5] test: add unit tests for atom type remapping --- tests/test_type_map_utils.py | 86 ++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 tests/test_type_map_utils.py diff --git a/tests/test_type_map_utils.py b/tests/test_type_map_utils.py new file mode 100644 index 000000000..9e1022cfc --- /dev/null +++ b/tests/test_type_map_utils.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +import unittest + +import numpy as np +from context import dpdata + +from dpdata.utils import sort_atom_names + + +class TestSortAtomNames(unittest.TestCase): + def test_sort_atom_names_type_map(self): + # Test basic functionality with type_map + data = { + "atom_names": ["H", "O"], + "atom_numbs": [2, 1], + "atom_types": np.array([1, 0, 0]), + } + type_map = ["O", "H"] + result = sort_atom_names(data, type_map=type_map) + + self.assertEqual(result["atom_names"], ["O", "H"]) + self.assertEqual(result["atom_numbs"], [1, 2]) + np.testing.assert_array_equal(result["atom_types"], np.array([0, 1, 1])) + + def test_sort_atom_names_type_map_with_zero_atoms(self): + # Test with type_map that includes elements with zero atoms + data = { + "atom_names": ["H", "O"], + "atom_numbs": [2, 1], + "atom_types": np.array([1, 0, 0]), + } + type_map = ["O", "H", "C"] # C is not in atom_names but in type_map + result = sort_atom_names(data, type_map=type_map) + + self.assertEqual(result["atom_names"], ["O", "H", "C"]) + self.assertEqual(result["atom_numbs"], [1, 2, 0]) + np.testing.assert_array_equal(result["atom_types"], np.array([0, 1, 1])) + + def test_sort_atom_names_type_map_missing_active_types(self): + # Test that ValueError is raised when active atom types are missing from type_map + data = { + "atom_names": ["H", "O"], + "atom_numbs": [2, 1], # Both H and O are active (numb > 0) + "atom_types": np.array([1, 0, 0]), + } + type_map = ["H"] # O is active but missing from type_map + + with self.assertRaises(ValueError) as cm: + sort_atom_names(data, type_map=type_map) + + self.assertIn("Active atom types", str(cm.exception)) + self.assertIn("not in provided type_map", str(cm.exception)) + self.assertIn("O", str(cm.exception)) + + def test_sort_atom_names_without_type_map(self): + # Test sorting without type_map (alphabetical order) + data = { + "atom_names": ["Zn", "O", "H"], + "atom_numbs": [1, 1, 2], + "atom_types": np.array([0, 1, 2, 2]), + } + result = sort_atom_names(data) + + self.assertEqual(result["atom_names"], ["H", "O", "Zn"]) + self.assertEqual(result["atom_numbs"], [2, 1, 1]) + np.testing.assert_array_equal(result["atom_types"], np.array([2, 1, 0, 0])) + + def test_sort_atom_names_with_zero_count_elements_removed(self): + # Test the case where original elements are A B C, but counts are 0 1 2, + # which should be able to map to B C (removing A which has count 0) + data = { + "atom_names": ["Cl", "O", "C"], + "atom_numbs": [0, 1, 2], + "atom_types": np.array([1, 2, 2]), + } + type_map = ["O", "C"] # A is omitted because it has 0 atoms + result = sort_atom_names(data, type_map=type_map) + + self.assertEqual(result["atom_names"], ["O", "C"]) + self.assertEqual(result["atom_numbs"], [1, 2]) + np.testing.assert_array_equal(result["atom_types"], np.array([0, 1, 1])) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From cfde1f49e5bbb82dd0fe2d7c03e5472e0d52c6ad Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 31 Oct 2025 00:19:21 +0000 Subject: [PATCH 3/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_type_map_utils.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/tests/test_type_map_utils.py b/tests/test_type_map_utils.py index 9e1022cfc..6f5108f07 100644 --- a/tests/test_type_map_utils.py +++ b/tests/test_type_map_utils.py @@ -3,7 +3,6 @@ import unittest import numpy as np -from context import dpdata from dpdata.utils import sort_atom_names @@ -18,11 +17,11 @@ def test_sort_atom_names_type_map(self): } type_map = ["O", "H"] result = sort_atom_names(data, type_map=type_map) - + self.assertEqual(result["atom_names"], ["O", "H"]) self.assertEqual(result["atom_numbs"], [1, 2]) np.testing.assert_array_equal(result["atom_types"], np.array([0, 1, 1])) - + def test_sort_atom_names_type_map_with_zero_atoms(self): # Test with type_map that includes elements with zero atoms data = { @@ -32,11 +31,11 @@ def test_sort_atom_names_type_map_with_zero_atoms(self): } type_map = ["O", "H", "C"] # C is not in atom_names but in type_map result = sort_atom_names(data, type_map=type_map) - + self.assertEqual(result["atom_names"], ["O", "H", "C"]) self.assertEqual(result["atom_numbs"], [1, 2, 0]) np.testing.assert_array_equal(result["atom_types"], np.array([0, 1, 1])) - + def test_sort_atom_names_type_map_missing_active_types(self): # Test that ValueError is raised when active atom types are missing from type_map data = { @@ -45,14 +44,14 @@ def test_sort_atom_names_type_map_missing_active_types(self): "atom_types": np.array([1, 0, 0]), } type_map = ["H"] # O is active but missing from type_map - + with self.assertRaises(ValueError) as cm: sort_atom_names(data, type_map=type_map) - + self.assertIn("Active atom types", str(cm.exception)) self.assertIn("not in provided type_map", str(cm.exception)) self.assertIn("O", str(cm.exception)) - + def test_sort_atom_names_without_type_map(self): # Test sorting without type_map (alphabetical order) data = { @@ -61,7 +60,7 @@ def test_sort_atom_names_without_type_map(self): "atom_types": np.array([0, 1, 2, 2]), } result = sort_atom_names(data) - + self.assertEqual(result["atom_names"], ["H", "O", "Zn"]) self.assertEqual(result["atom_numbs"], [2, 1, 1]) np.testing.assert_array_equal(result["atom_types"], np.array([2, 1, 0, 0])) @@ -76,11 +75,11 @@ def test_sort_atom_names_with_zero_count_elements_removed(self): } type_map = ["O", "C"] # A is omitted because it has 0 atoms result = sort_atom_names(data, type_map=type_map) - + self.assertEqual(result["atom_names"], ["O", "C"]) self.assertEqual(result["atom_numbs"], [1, 2]) np.testing.assert_array_equal(result["atom_types"], np.array([0, 1, 1])) if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() From 3ec40471b8dfaa2c857d9912e1f1f5a716757bd1 Mon Sep 17 00:00:00 2001 From: SchrodingersCattt Date: Fri, 31 Oct 2025 00:40:17 +0000 Subject: [PATCH 4/5] style: enhance comments --- tests/test_type_map_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_type_map_utils.py b/tests/test_type_map_utils.py index 6f5108f07..4ae9567d5 100644 --- a/tests/test_type_map_utils.py +++ b/tests/test_type_map_utils.py @@ -68,12 +68,13 @@ def test_sort_atom_names_without_type_map(self): def test_sort_atom_names_with_zero_count_elements_removed(self): # Test the case where original elements are A B C, but counts are 0 1 2, # which should be able to map to B C (removing A which has count 0) + # Example: A, B, C = Cl, O, C data = { "atom_names": ["Cl", "O", "C"], "atom_numbs": [0, 1, 2], "atom_types": np.array([1, 2, 2]), } - type_map = ["O", "C"] # A is omitted because it has 0 atoms + type_map = ["O", "C"] # Cl is omitted because it has 0 atoms result = sort_atom_names(data, type_map=type_map) self.assertEqual(result["atom_names"], ["O", "C"]) From 25969b0ee4da99e4e4b3590b7547c720fdc04ec1 Mon Sep 17 00:00:00 2001 From: SchrodingersCattt Date: Tue, 5 May 2026 07:56:54 +0000 Subject: [PATCH 5/5] perf: vectorize sort_atom_names type_map remap Replace the per-type boolean-mask loop and repeated type_map.index(...) calls with a single name->new-index dict, an old->new lookup array, and one vectorized fancy-index operation. Complexity drops from O(n_types^2 + n_types * n_atoms) back to O(n_atoms + n_types) while preserving the zero-count type_map semantics introduced in this PR. Tests (n=200000 atoms, min of 5 runs): n_types=20 PR before: ~5.2 ms after: 0.29 ms n_types=100 PR before: ~10.3 ms after: 0.31 ms n_types=500 PR before: ~34 ms after: 0.50 ms n_types=1000 PR before: ~68 ms after: 0.72 ms Unit tests: tests/test_type_map_utils.py 5/5 passed tests/test_type_map.py (TestTypeMap1/2) 4/4 passed tests/test_to_ase.py 24/24 passed tests/test_system_set_type.py 2/2 passed Co-authored-by: Cursor --- dpdata/utils.py | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/dpdata/utils.py b/dpdata/utils.py index 4c72bec94..197d83333 100644 --- a/dpdata/utils.py +++ b/dpdata/utils.py @@ -86,30 +86,28 @@ def sort_atom_names(data, type_map=None): raise ValueError(f"Active atom types {missing} not in provided type_map.") # for the condition that type_map is a proper superset of atom_names, - # we allow new elements with atom_numb = 0 + # we allow new elements with atom_numb = 0. + # Precompute name -> new index once to avoid repeated O(n_types) + # type_map.index(...) calls (which would make the loop O(n_types^2)). + name_to_new_idx = {name: i for i, name in enumerate(type_map)} + + # Build new_numbs and the old->new lookup array in a single pass. + # Old names absent from type_map have atom_numb == 0 (validated above) + # and never appear in atom_types, so -1 is a harmless sentinel for + # their slots in the lookup table. new_names = list(type_map) - new_numbs = [] - name_to_old_idx = {name: i for i, name in enumerate(orig_names)} - - for name in new_names: - if name in name_to_old_idx: - new_numbs.append(orig_numbs[name_to_old_idx[name]]) - else: - new_numbs.append(0) - - # build mapping from old atom type index to new one - # old_types[i] = j --> new_types[i] = type_map.index(atom_names[j]) - old_to_new_index = {} + new_numbs = [0] * len(type_map) + lookup = np.full(len(orig_names), -1, dtype=np.int64) for old_idx, name in enumerate(orig_names): - if name in type_map_set: - new_idx = type_map.index(name) - old_to_new_index[old_idx] = new_idx - - # remap atom_types using the index mapping - old_types = np.array(data["atom_types"]) - new_types = np.empty_like(old_types) - for old_idx, new_idx in old_to_new_index.items(): - new_types[old_types == old_idx] = new_idx + new_idx = name_to_new_idx.get(name) + if new_idx is not None: + lookup[old_idx] = new_idx + new_numbs[new_idx] = orig_numbs[old_idx] + + # Remap atom_types with a single vectorized fancy-index operation + # (O(n_atoms + n_types) instead of O(n_types * n_atoms)). + old_types = np.asarray(data["atom_types"]) + new_types = lookup[old_types] # update data in-place data["atom_names"] = new_names