From 7de7516b979c8db7a614688722d08f77634e879b Mon Sep 17 00:00:00 2001
From: Tocchann <tosiyuki@hh.iij4u.or.jp>
Date: Wed, 29 Apr 2026 13:25:46 +0900
Subject: [PATCH] Fix ScanName() to correctly decode non-UTF-8 name objects
 (Shift-JIS, GBK, etc.)

Problem
-------
In Lexer.cs (Pdf.IO) and CLexer.cs (Pdf.Content), ScanName() had two bugs:

1. Non-ASCII byte detection used (byte & 0xC0) == 0xC0, which misses bytes in the
   range 0x80-0xBF.  Shift-JIS bytes such as 0x83 satisfy (0x83 & 0xC0) == 0x80,
   so a name whose non-ASCII bytes all fall in that range never entered the
   re-decode branch and its raw bytes were returned garbled.

2. The unconditional Encoding.UTF8.GetString() call silently replaced invalid bytes
   with U+FFFD instead of throwing, causing silent data loss for Shift-JIS, GBK,
   Big5, EUC-KR, and similar legacy encodings.

Solution
--------
- Widen non-ASCII detection from (& 0xC0)==0xC0 to (& 0x80)!=0 so every byte >=
  0x80 triggers the re-decode path (covers UTF-8 lead bytes and all legacy encodings).
- Replace unconditional UTF-8 decode with a strict UTF-8 decoder (throwOnInvalidBytes:
  true) wrapped in try/catch; on DecoderFallbackException fall back to the ANSI code
  page of the current culture via PdfEncoders.AnsiCodepageEncoding.
- Add AnsiCodepageEncoding property to PdfEncoders that resolves the ANSI code page
  for the current culture (CP932 for ja-JP, CP936 for zh-CN, CP950 for zh-TW, etc.).
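- Register CodePagesEncodingProvider once, in the PdfEncoders static constructor, on
  non-.NET-Framework targets so that Encoding.GetEncoding(codePage) resolves legacy
  code pages.  The netstandard2.0 build skips the call when running on .NET Framework.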
- Add System.Text.Encoding.CodePages NuGet reference for netstandard2.0 targets
  (net462 already includes all code pages natively).
- Add unit tests covering UTF-8 names, non-UTF-8 no-throw, and Shift-JIS fallback.

Fixes empira/PDFsharp#364
---
 src/Directory.Packages.props                  |  1 +
 .../src/PdfSharp/Pdf.Content/CLexer.cs        | 21 ++++--
 .../src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs | 21 ++++--
 .../src/PdfSharp/Pdf.Internal/PdfEncoders.cs  | 52 ++++++++++++---
 .../src/PDFsharp/src/PdfSharp/PdfSharp.csproj |  4 ++
 .../tests/PdfSharp.Tests/IO/LexerTests.cs     | 65 +++++++++++++++++++
 6 files changed, 144 insertions(+), 20 deletions(-)
diff --git a/src/Directory.Packages.props b/src/Directory.Packages.props
index dcb63a2f..4360e29f 100644
--- a/src/Directory.Packages.props
+++ b/src/Directory.Packages.props
@@ -37,6 +37,7 @@
     <PackageVersion Include="Xunit.SkippableFact" Version="1.5.61" />
     <PackageVersion Include="XunitXml.TestLogger" Version="8.0.0" />
     <PackageVersion Include="FluentAssertions" Version="7.2.1" /> <!-- Latest 7.x version under Apache license. -->
+    <PackageVersion Include="System.Text.Encoding.CodePages" Version="10.0.5" />
     <!-- Other packages -->
     <PackageVersion Include="System.Resources.Extensions" Version="10.0.5" />
     <!-- Needed for PDFsharp-GDI. -->
diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Content/CLexer.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Content/CLexer.cs
index 85102697..84162976 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Content/CLexer.cs
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Content/CLexer.cs
@@ -234,20 +234,28 @@ static char LogError(char ch)
             }
 
             var name = Token;
-            // Check token for UTF-8 encoding.
+            // Check for non-ASCII bytes that may indicate a multi-byte or legacy ANSI encoding.
             for (int idx = 0; idx < name.Length; idx++)
             {
-                // If the two top most significant bits are set this identifies a 2, 3, or 4
-                // byte UTF-8 encoding sequence.
-                if ((name[idx] & 0xC0) == 0xC0)
+                if ((name[idx] & 0x80) != 0)
                 {
-                    // Special characters in Name objects use UTF-8 encoding.
+                    // Special characters in Name objects may use UTF-8 or a legacy ANSI encoding.
                     var length = name.Length;
                     var bytes = new byte[length];
                     for (int idx2 = 0; idx2 < length; idx2++)
                         bytes[idx2] = (byte)name[idx2];
 
-                    var decodedName = Encoding.UTF8.GetString(bytes);
+                    string decodedName;
+                    try
+                    {
+                        // Try strict UTF-8 first; throws DecoderFallbackException on invalid sequences.
+                        decodedName = StrictUtf8.GetString(bytes);
+                    }
+                    catch (DecoderFallbackException)
+                    {
+                        // Fallback to ANSI code page encoding if UTF-8 decoding fails.
+                        decodedName = PdfEncoders.AnsiCodepageEncoding.GetString( bytes );
+                    }
                     _token.Clear();
                     _token.Append(decodedName);
                     break;
@@ -498,6 +506,7 @@ public CSymbol ScanNumber()
             return Symbol = CSymbol.Real; // CLexer returns "Real" because there is no "LongInteger".
         }
 
+        static readonly UTF8Encoding StrictUtf8 = new(false, true);
         static readonly double[] PowersOf10 = [1, 10, 100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000, 100_000_000, 1_000_000_000, 10_000_000_000];
 
         /// <summary>
diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs
index 738b49dd..d9954651 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs
@@ -247,20 +247,28 @@ static char LogError(char ch)
             }
 
             var name = Token;
-            // Check for UTF-8 encoding.
+            // Check for non-ASCII bytes that may indicate a multi-byte or legacy ANSI encoding.
             for (int idx = 0; idx < name.Length; idx++)
             {
-                // If the two top most significant bits are set this identifies a 2, 3, or 4
-                // byte UTF-8 encoding sequence.
-                if ((name[idx] & 0xC0) == 0xC0)
+                if ((name[idx] & 0x80) != 0)
                 {
-                    // Special characters in Name objects use UTF-8 encoding.
+                    // Special characters in Name objects may use UTF-8 or a legacy ANSI encoding.
                     var length = name.Length;
                     var bytes = new byte[length];
                     for (int idx2 = 0; idx2 < length; idx2++)
                         bytes[idx2] = (byte)name[idx2];
 
-                    var decodedName = Encoding.UTF8.GetString(bytes);
+                    string decodedName;
+                    try
+                    {
+                        // Try strict UTF-8 first; throws DecoderFallbackException on invalid sequences.
+                        decodedName = StrictUtf8.GetString(bytes);
+                    }
+                    catch (DecoderFallbackException)
+                    {
+                        // Fallback to ANSI code page encoding if UTF-8 decoding fails.
+                        decodedName = PdfEncoders.AnsiCodepageEncoding.GetString( bytes );
+                    }
                     _token.Clear();
                     _token.Append(decodedName);
                     break;
@@ -558,6 +566,7 @@ int TryReadReference()
             }
         }
 
+        static readonly UTF8Encoding StrictUtf8 = new(false, true);
         static readonly double[] PowersOf10 = [1, 10, 100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000, 100_000_000, 1_000_000_000, 10_000_000_000];
 
         /// <summary>
diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Internal/PdfEncoders.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Internal/PdfEncoders.cs
index 7a6245b4..7925d680 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Internal/PdfEncoders.cs
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Internal/PdfEncoders.cs
@@ -17,14 +17,36 @@ namespace PdfSharp.Pdf.Internal
     /// </summary>
     public static class PdfEncoders
     {
-        /// <summary>
-        /// Gets the PDFsharp specific encoder RawEncoding.
-        /// Ray encoding allows wo work with string instead of byte array.
-        /// A raw encoded string is equivalent to a byte array of the same length
-        /// where each sting character represents one byte.
-        /// Therefore, each character of a raw string has a value less than 256.
-        /// </summary>
-        public static Encoding RawEncoding => _rawEncoding ??= new RawEncoding();
+        static PdfEncoders()
+        {
+#if !NETFRAMEWORK
+#if NETSTANDARD
+            // netstandard2.0 can also run on .NET Framework, so the runtime is checked here.
+            // .NET Framework does not need CodePagesEncodingProvider and the component is not present there, so the call itself is avoided.
+            if (System.Runtime.InteropServices.RuntimeInformation.FrameworkDescription.StartsWith(".NET Framework", StringComparison.OrdinalIgnoreCase))
+                return;
+#endif
+			RegisterCodePages();
+#endif
+        }
+#if !NETFRAMEWORK
+        private static void RegisterCodePages()
+        {
+            // Register CodePagesEncodingProvider so that legacy encodings like Shift-JIS (CP932),
+            // GBK (CP936), Big5 (CP950), EUC-KR (CP949), etc. are available on all platforms.
+            // On .NET Framework, all code pages are natively available and registration is not needed.
+            Encoding.RegisterProvider( CodePagesEncodingProvider.Instance );
+        }
+#endif
+
+		/// <summary>
+		/// Gets the PDFsharp specific encoder RawEncoding.
+		/// Raw encoding allows working with a string instead of a byte array.
+		/// A raw encoded string is equivalent to a byte array of the same length
+		/// where each string character represents one byte.
+		/// Therefore, each character of a raw string has a value less than 256.
+		/// </summary>
+		public static Encoding RawEncoding => _rawEncoding ??= new RawEncoding();
         static Encoding? _rawEncoding;
 
         internal static Encoding ByteStringEncoding => _rawEncoding ??= new RawEncoding();  // new name??
@@ -64,6 +86,20 @@ public static Encoding WinAnsiEncoding
         public static Encoding UnicodeEncoding => _unicodeEncoding ??= Encoding.Unicode;
         static Encoding? _unicodeEncoding;
 
+        /// <summary>
+        /// Gets an encoding that corresponds to the ANSI code page of the current culture.
+        /// </summary>
+        /// <remarks>The code page is read from CultureInfo.CurrentCulture.TextInfo.ANSICodePage on every
+        /// access and resolved with Encoding.GetEncoding. On .NET 5 or later, Latin1 is returned when the
+        /// culture reports code page 0; on other targets the reported value is passed to
+        /// Encoding.GetEncoding unchanged, including 0.</remarks>
+        public static Encoding AnsiCodepageEncoding =>
+#if NET5_0_OR_GREATER
+            CultureInfo.CurrentCulture.TextInfo.ANSICodePage != 0 ? Encoding.GetEncoding( CultureInfo.CurrentCulture.TextInfo.ANSICodePage ) : Encoding.Latin1;
+#else
+            Encoding.GetEncoding(CultureInfo.CurrentCulture.TextInfo.ANSICodePage);
+#endif
+
         ///// <summary>
         ///// Encodes a string from a byte array. Each character gets the code of the corresponding byte.
         ///// </summary>
diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/PdfSharp.csproj b/src/foundation/src/PDFsharp/src/PdfSharp/PdfSharp.csproj
index f7ff7380..27a05eb7 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/PdfSharp.csproj
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/PdfSharp.csproj
@@ -70,6 +70,10 @@
     <InternalsVisibleTo Include="PdfSharp.Graphics.Pdf,       PublicKey=$(PDFsharpStronNamePublicKey)" />
   </ItemGroup>
 
+  <ItemGroup Condition="'$(TargetFramework)' == 'netstandard2.0'">
+    <PackageReference Include="System.Text.Encoding.CodePages" />
+  </ItemGroup>
+
   <ItemGroup>
     <Compile Include="..\..\..\shared\src\PdfSharp.Shared\dotnet\GetSubArray.cs" Link="Properties\GetSubArray(included).cs" />
     <Compile Include="..\..\..\shared\src\PdfSharp.System\Properties\FloatOrDouble.cs" Link="Properties\FloatOrDouble.cs" />
diff --git a/src/foundation/src/PDFsharp/tests/PdfSharp.Tests/IO/LexerTests.cs b/src/foundation/src/PDFsharp/tests/PdfSharp.Tests/IO/LexerTests.cs
index 690aa917..610ecdcd 100644
--- a/src/foundation/src/PDFsharp/tests/PdfSharp.Tests/IO/LexerTests.cs
+++ b/src/foundation/src/PDFsharp/tests/PdfSharp.Tests/IO/LexerTests.cs
@@ -258,6 +258,71 @@ public void Scan_ObjRef_Tests(string text, (int, int) objID/*, bool testAsLong,
 
         }
 
+        [Fact]
+        public void ScanName_UTF8_encoded_name_is_decoded_correctly()
+        {
+            // UTF-8 encoding of "日本語" is E6 97 A5 E6 9C AC E8 AA 9E.
+            // ScanName returns the token INCLUDING the leading '/', e.g. "/日本語".
+            var nameBytes = System.Text.Encoding.UTF8.GetBytes("日本語");
+            var bytes = new byte[1 + nameBytes.Length + 1];
+            bytes[0] = (byte)'/';
+            nameBytes.CopyTo(bytes, 1);
+            bytes[^1] = (byte)' ';
+
+            var lexer = CreateLexerFromBytes(bytes);
+            var symbol = lexer.ScanName();
+
+            symbol.Should().Be(Symbol.Name);
+            lexer.Token.Should().Be("/日本語");
+        }
+
+        [Fact]
+        public void ScanName_NonUTF8_bytes_do_not_throw()
+        {
+            // 0xE3 0x81 is an invalid UTF-8 sequence (incomplete 3-byte sequence).
+            // Regardless of platform, this must not throw and must return some string.
+            byte[] bytes = [(byte)'/', 0xE3, 0x81, (byte)' '];
+
+            var lexer = CreateLexerFromBytes(bytes);
+            var symbol = lexer.ScanName();
+
+            symbol.Should().Be(Symbol.Name);
+            lexer.Token.Should().NotBeNullOrEmpty();
+            lexer.Token.Should().StartWith("/");
+        }
+
+#if NET8_0_OR_GREATER
+        [SkippableFact]
+        public void ScanName_ShiftJIS_name_is_decoded_on_ShiftJIS_default_encoding()
+        {
+            // The fallback in ScanName uses the current culture's ANSI code page, so check that instead
+            // of Encoding.Default, which is always UTF-8 on .NET 5 or later and would always skip here.
+            // Skip unless the current culture's ANSI code page is 932 (Shift-JIS).
+            Skip.If(System.Globalization.CultureInfo.CurrentCulture.TextInfo.ANSICodePage != 932,
+                "Requires the current culture to use ANSI code page 932 (Shift-JIS).");
+
+            // Shift-JIS encoding of "日語": 93 FA 8C EA. "本" (96 7B) is left out because its trail byte
+            // is '{', a PDF delimiter. ScanName returns the token INCLUDING the leading '/'.
+            byte[] sjisBytes = [0x93, 0xFA, 0x8C, 0xEA];
+            byte[] bytes = new byte[1 + sjisBytes.Length + 1];
+            bytes[0] = (byte)'/';
+            sjisBytes.CopyTo(bytes, 1);
+            bytes[^1] = (byte)' ';
+
+            var lexer = CreateLexerFromBytes(bytes);
+            var symbol = lexer.ScanName();
+
+            symbol.Should().Be(Symbol.Name);
+            lexer.Token.Should().Be("/日語");
+        }
+#endif
+
+        static Lexer CreateLexerFromBytes(byte[] bytes)
+        {
+            var stream = new MemoryStream(bytes);
+            return new Lexer(stream, null);
+        }
+
         Lexer CreateLexer(string text)
         {
             var pdfString = new PdfString(text, PdfStringEncoding.RawEncoding);