From a3d5e27740593729a2efc681da72c529b040e6b3 Mon Sep 17 00:00:00 2001
From: orangeruan128
Date: Mon, 20 Apr 2026 12:36:52 +1000
Subject: [PATCH] Tolerate truncated / partially-corrupt FSSHTTPB streams
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Several sites in FileNode.py read fixed-size structs (CompactID,
ObjectSpaceObjectStreamHeader, the PropertySet header) or seek to a
FileNodeList stp offset without guarding against truncated or
out-of-range input. On real-world OneNote files this is hit by:

* ValueError: cannot fit 'int' into an offset-sized integer —
  FileNodeList.__init__ calls file.seek(stp) where stp is the result of
  an unsigned read but the underlying file object treats it as a signed
  off_t.
* struct.error: unpack requires a buffer of N bytes —
  ObjectSpaceObjectStreamOfIDs.body or its header, or PropertySet's
  cProperties, read past EOF.
* KeyError on document._global_identification_table[...][guidIndex] —
  CompactID.__str__/__repr__ resolves a guidIndex that is the documented
  0xFFFFFF "invalid" sentinel, or a cross-revision reference whose table
  is not populated yet.

Each of these previously aborted parsing of the entire document, even
when only one inner structure was malformed. This change keeps the happy
path identical and only adds local recovery:

* FileNodeList: out-of-range stp → empty list, do not seek.
* ObjectSpaceObjectStreamOfIDs: truncated header → synthetic empty
  header (Count=0, OsidStreamNotPresent=True); truncated body → stop
  reading and keep the entries parsed so far.
* PropertySet: truncated cProperties → empty set (cProperties=0).
* CompactID: missing guid table entry → a readable placeholder string
  built from the guidIndex instead of a KeyError.

No spec-defined behavior changes for well-formed input.
--- pyOneNote/FileNode.py | 55 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 10 deletions(-) diff --git a/pyOneNote/FileNode.py b/pyOneNote/FileNode.py index a5f04d0..49eea0c 100644 --- a/pyOneNote/FileNode.py +++ b/pyOneNote/FileNode.py @@ -13,7 +13,17 @@ def __init__(self, file): class FileNodeList: def __init__(self, file, document, file_chunk_reference): - file.seek(file_chunk_reference.stp) + # `stp` can be out-of-range (negative when interpreted signed, or > 2**63) + # on partially-corrupt or padded files. `file.seek` then raises: + # ValueError: cannot fit 'int' into an offset-sized integer + # which would otherwise abort parsing of the entire document. Treat such a + # reference as an empty list so the surrounding tree can still be parsed. + try: + file.seek(file_chunk_reference.stp) + except (OverflowError, ValueError, OSError): + self.end = file_chunk_reference.stp + self.fragments = [] + return self.end = file_chunk_reference.stp + file_chunk_reference.cb self.fragments = [] @@ -469,15 +479,21 @@ def __init__(self, file, document): self.document = document self.current_revision = self.document.cur_revision + def _resolve_guid(self): + try: + return self.document._global_identification_table[self.current_revision][self.guidIndex] + except KeyError: + # 0xFFFFFF (16777215) is the documented "invalid" sentinel; other misses + # can also occur on cross-revision references when the global identification + # table for the current revision was not (yet) fully populated. Returning a + # readable placeholder here keeps the rest of the document parseable. 
+ return ''.format(self.guidIndex) + def __str__(self): - return ' ({}, {})'.format( - self.document._global_identification_table[self.current_revision][self.guidIndex], - self.n) + return ' ({}, {})'.format(self._resolve_guid(), self.n) def __repr__(self): - return ' ({}, {})'.format( - self.document._global_identification_table[self.current_revision][self.guidIndex], - self.n) + return ' ({}, {})'.format(self._resolve_guid(), self.n) class JCID: @@ -569,11 +585,26 @@ def __init__(self, file, document): class ObjectSpaceObjectStreamOfIDs: def __init__(self, file, document): - self.header = ObjectSpaceObjectStreamHeader(file) self.body = [] self.head = 0 + try: + self.header = ObjectSpaceObjectStreamHeader(file) + except struct.error: + # Truncated stream at header read — synthesize an empty header so callers + # that check .header.OsidStreamNotPresent / .ExtendedStreamsPresent / .Count + # do not also need to special-case a missing attribute. + class _EmptyHeader: + Count = 0 + ExtendedStreamsPresent = False + OsidStreamNotPresent = True + self.header = _EmptyHeader() + return for i in range(self.header.Count): - self.body.append(CompactID(file, document)) + try: + self.body.append(CompactID(file, document)) + except struct.error: + # Truncated mid-stream — stop reading and let the caller use what we have. + break def read(self): res = None @@ -596,7 +627,11 @@ def __init__(self, file): class PropertySet: def __init__(self, file, OIDs, OSIDs, ContextIDs, document): self.current = file.tell() - self.cProperties, = struct.unpack('