From a3d5e27740593729a2efc681da72c529b040e6b3 Mon Sep 17 00:00:00 2001
From: orangeruan128 <orangeruan128@users.noreply.github.com>
Date: Mon, 20 Apr 2026 12:36:52 +1000
Subject: [PATCH] Tolerate truncated / partially-corrupt FSSHTTPB streams
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Several sites in FileNode.py read fixed-size structs (CompactID,
ObjectSpaceObjectStreamHeader, the PropertySet header) or seek to a
FileNodeList stp without guarding against truncated or corrupt input. On
real-world OneNote files this surfaces as:

  * ValueError: cannot fit 'int' into an offset-sized integer
      — FileNodeList.__init__ calls file.seek(stp) where stp is the result
        of an unsigned read but the underlying file object treats it as a
        signed off_t.
  * struct.error: unpack requires a buffer of N bytes
      — ObjectSpaceObjectStreamOfIDs.body or its header, or PropertySet's
        cProperties, read past EOF.
  * KeyError on document._global_identification_table[...][guidIndex]
      — CompactID.__str__/__repr__ resolves a guidIndex that is the
        documented 0xFFFFFF "invalid" sentinel, or a cross-revision
        reference whose table is not populated yet.

Each of these previously aborted parsing of the entire document, even
when only one inner structure was malformed. This change keeps the happy
path identical and only adds local recovery:

  * FileNodeList: seek to an out-of-range stp fails → the seek error is
    caught and the list is left empty (no fragments are read).
  * ObjectSpaceObjectStreamOfIDs: truncated header → synthetic empty
    header (Count=0, ExtendedStreamsPresent=False,
    OsidStreamNotPresent=True); truncated body → stop reading and keep
    the CompactIDs read so far.
  * PropertySet: truncated cProperties → empty set (cProperties=0).
  * CompactID: missing guid table entry → '<unresolved guidIndex=0x...>'
    placeholder string instead of KeyError.

No spec-defined behavior changes for well-formed input.
---
 pyOneNote/FileNode.py | 55 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 45 insertions(+), 10 deletions(-)
diff --git a/pyOneNote/FileNode.py b/pyOneNote/FileNode.py
index a5f04d0..49eea0c 100644
--- a/pyOneNote/FileNode.py
+++ b/pyOneNote/FileNode.py
@@ -13,7 +13,17 @@ def __init__(self, file):
 
 class FileNodeList:
     def __init__(self, file, document, file_chunk_reference):
-        file.seek(file_chunk_reference.stp)
+        # `stp` can be out-of-range (negative when interpreted signed, or > 2**63)
+        # on partially-corrupt or padded files. `file.seek` then raises:
+        #     ValueError: cannot fit 'int' into an offset-sized integer
+        # which would otherwise abort parsing of the entire document. Treat such a
+        # reference as an empty list so the surrounding tree can still be parsed.
+        try:
+            file.seek(file_chunk_reference.stp)
+        except (OverflowError, ValueError, OSError):
+            self.end = file_chunk_reference.stp
+            self.fragments = []
+            return
         self.end = file_chunk_reference.stp + file_chunk_reference.cb
         self.fragments = []
 
@@ -469,15 +479,21 @@ def __init__(self, file, document):
         self.document = document
         self.current_revision = self.document.cur_revision
 
+    def _resolve_guid(self):
+        try:
+            return self.document._global_identification_table[self.current_revision][self.guidIndex]
+        except KeyError:
+            # 0xFFFFFF (16777215) is the documented "invalid" sentinel; other misses
+            # can also occur on cross-revision references when the global identification
+            # table for the current revision was not (yet) fully populated. Returning a
+            # readable placeholder here keeps the rest of the document parseable.
+            return '<unresolved guidIndex=0x{:06x}>'.format(self.guidIndex)
+
     def __str__(self):
-        return '<ExtendedGUID> ({}, {})'.format(
-        self.document._global_identification_table[self.current_revision][self.guidIndex],
-        self.n)
+        return '<ExtendedGUID> ({}, {})'.format(self._resolve_guid(), self.n)
 
     def __repr__(self):
-        return '<ExtendedGUID> ({}, {})'.format(
-        self.document._global_identification_table[self.current_revision][self.guidIndex],
-        self.n)
+        return '<ExtendedGUID> ({}, {})'.format(self._resolve_guid(), self.n)
 
 
 class JCID:
@@ -569,11 +585,26 @@ def __init__(self, file, document):
 
 class ObjectSpaceObjectStreamOfIDs:
     def __init__(self, file, document):
-        self.header = ObjectSpaceObjectStreamHeader(file)
         self.body = []
         self.head = 0
+        try:
+            self.header = ObjectSpaceObjectStreamHeader(file)
+        except struct.error:
+            # Truncated stream at header read — synthesize an empty header so callers
+            # that check .header.OsidStreamNotPresent / .ExtendedStreamsPresent / .Count
+            # do not also need to special-case a missing attribute.
+            class _EmptyHeader:
+                Count = 0
+                ExtendedStreamsPresent = False
+                OsidStreamNotPresent = True
+            self.header = _EmptyHeader()
+            return
         for i in range(self.header.Count):
-            self.body.append(CompactID(file, document))
+            try:
+                self.body.append(CompactID(file, document))
+            except struct.error:
+                # Truncated mid-stream — stop reading and let the caller use what we have.
+                break
 
     def read(self):
         res = None
@@ -596,7 +627,11 @@ def __init__(self, file):
 class PropertySet:
     def __init__(self, file, OIDs, OSIDs, ContextIDs, document):
         self.current = file.tell()
-        self.cProperties, = struct.unpack('<H', file.read(2))
+        try:
+            self.cProperties, = struct.unpack('<H', file.read(2))
+        except struct.error:
+            # Truncated stream — treat as an empty property set.
+            self.cProperties = 0
         self.rgPrids = []
         self.indent = ''
         self.document = document