From 7de7516b979c8db7a614688722d08f77634e879b Mon Sep 17 00:00:00 2001 From: Tocchann Date: Wed, 29 Apr 2026 13:25:46 +0900 Subject: [PATCH] Fix ScanName() to correctly decode non-UTF-8 name objects (Shift-JIS, GBK, etc.) Problem ------- In Lexer.cs (Pdf.IO) and CLexer.cs (Pdf.Content), ScanName() had two bugs: 1. Non-ASCII byte detection used (byte & 0xC0) == 0xC0, which misses bytes in the range 0x80-0xBF. Shift-JIS lead/continuation bytes such as 0x83 satisfy (0x83 & 0xC0) == 0x80, so a name whose non-ASCII bytes all fall in that range never entered the re-decode branch and its raw bytes were returned garbled. 2. The unconditional Encoding.UTF8.GetString() call silently replaced invalid bytes with U+FFFD instead of throwing, causing silent data loss for Shift-JIS, GBK, Big5, EUC-KR, and similar legacy encodings. Fix --- - Widen non-ASCII detection from (& 0xC0)==0xC0 to (& 0x80)!=0 so every byte >= 0x80 triggers the re-decode path (covers UTF-8 lead bytes and all legacy encodings). - Replace unconditional UTF-8 decode with a strict UTF-8 decoder (throwOnInvalidBytes: true) wrapped in try/catch; on DecoderFallbackException fall back to the ANSI code page of the current culture via PdfEncoders.AnsiCodepageEncoding. - Add AnsiCodepageEncoding property to PdfEncoders that resolves the ANSI code page for the current culture (CP932 for ja-JP, CP936 for zh-CN, CP950 for zh-TW, etc.). - Register CodePagesEncodingProvider once, from the PdfEncoders static constructor, on non-.NET-Framework targets so that Encoding.GetEncoding(codePage) resolves legacy code pages. - Add System.Text.Encoding.CodePages NuGet reference for netstandard2.0 targets (net462 already includes all code pages natively). - Add unit tests covering UTF-8 names, non-UTF-8 no-throw, and Shift-JIS fallback. 
Fixes empira/PDFsharp#364 --- src/Directory.Packages.props | 1 + .../src/PdfSharp/Pdf.Content/CLexer.cs | 21 ++++-- .../src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs | 21 ++++-- .../src/PdfSharp/Pdf.Internal/PdfEncoders.cs | 52 ++++++++++++--- .../src/PDFsharp/src/PdfSharp/PdfSharp.csproj | 4 ++ .../tests/PdfSharp.Tests/IO/LexerTests.cs | 65 +++++++++++++++++++ 6 files changed, 144 insertions(+), 20 deletions(-) diff --git a/src/Directory.Packages.props b/src/Directory.Packages.props index dcb63a2f..4360e29f 100644 --- a/src/Directory.Packages.props +++ b/src/Directory.Packages.props @@ -37,6 +37,7 @@ + diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Content/CLexer.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Content/CLexer.cs index 85102697..84162976 100644 --- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Content/CLexer.cs +++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Content/CLexer.cs @@ -234,20 +234,28 @@ static char LogError(char ch) } var name = Token; - // Check token for UTF-8 encoding. + // Check for non-ASCII bytes that may indicate a multi-byte or legacy ANSI encoding. for (int idx = 0; idx < name.Length; idx++) { - // If the two top most significant bits are set this identifies a 2, 3, or 4 - // byte UTF-8 encoding sequence. - if ((name[idx] & 0xC0) == 0xC0) + if ((name[idx] & 0x80) != 0) { - // Special characters in Name objects use UTF-8 encoding. + // Special characters in Name objects may use UTF-8 or a legacy ANSI encoding. var length = name.Length; var bytes = new byte[length]; for (int idx2 = 0; idx2 < length; idx2++) bytes[idx2] = (byte)name[idx2]; - var decodedName = Encoding.UTF8.GetString(bytes); + string decodedName; + try + { + // Try strict UTF-8 first; throws DecoderFallbackException on invalid sequences. + decodedName = StrictUtf8.GetString(bytes); + } + catch (DecoderFallbackException) + { + // Fallback to ANSI code page encoding if UTF-8 decoding fails. 
+ decodedName = PdfEncoders.AnsiCodepageEncoding.GetString( bytes ); + } _token.Clear(); _token.Append(decodedName); break; @@ -498,6 +506,7 @@ public CSymbol ScanNumber() return Symbol = CSymbol.Real; // CLexer returns "Real" because there is no "LongInteger". } + static readonly UTF8Encoding StrictUtf8 = new(false, true); static readonly double[] PowersOf10 = [1, 10, 100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000, 100_000_000, 1_000_000_000, 10_000_000_000]; /// diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs index 738b49dd..d9954651 100644 --- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs +++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs @@ -247,20 +247,28 @@ static char LogError(char ch) } var name = Token; - // Check for UTF-8 encoding. + // Check for non-ASCII bytes that may indicate a multi-byte or legacy ANSI encoding. for (int idx = 0; idx < name.Length; idx++) { - // If the two top most significant bits are set this identifies a 2, 3, or 4 - // byte UTF-8 encoding sequence. - if ((name[idx] & 0xC0) == 0xC0) + if ((name[idx] & 0x80) != 0) { - // Special characters in Name objects use UTF-8 encoding. + // Special characters in Name objects may use UTF-8 or a legacy ANSI encoding. var length = name.Length; var bytes = new byte[length]; for (int idx2 = 0; idx2 < length; idx2++) bytes[idx2] = (byte)name[idx2]; - var decodedName = Encoding.UTF8.GetString(bytes); + string decodedName; + try + { + // Try strict UTF-8 first; throws DecoderFallbackException on invalid sequences. + decodedName = StrictUtf8.GetString(bytes); + } + catch (DecoderFallbackException) + { + // Fallback to ANSI code page encoding if UTF-8 decoding fails. 
+ decodedName = PdfEncoders.AnsiCodepageEncoding.GetString( bytes ); + } _token.Clear(); _token.Append(decodedName); break; @@ -558,6 +566,7 @@ int TryReadReference() } } + static readonly UTF8Encoding StrictUtf8 = new(false, true); static readonly double[] PowersOf10 = [1, 10, 100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000, 100_000_000, 1_000_000_000, 10_000_000_000]; /// diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Internal/PdfEncoders.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Internal/PdfEncoders.cs index 7a6245b4..7925d680 100644 --- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Internal/PdfEncoders.cs +++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Internal/PdfEncoders.cs @@ -17,14 +17,36 @@ namespace PdfSharp.Pdf.Internal /// public static class PdfEncoders { - /// - /// Gets the PDFsharp specific encoder RawEncoding. - /// Ray encoding allows wo work with string instead of byte array. - /// A raw encoded string is equivalent to a byte array of the same length - /// where each sting character represents one byte. - /// Therefore, each character of a raw string has a value less than 256. - /// - public static Encoding RawEncoding => _rawEncoding ??= new RawEncoding(); + static PdfEncoders() + { +#if !NETFRAMEWORK +#if NETSTANDARD + // The netstandard2.0 build can also run on .NET Framework, so the decision is made at runtime. + // .NET Framework does not need CodePagesEncodingProvider and the component may be missing there, so the call itself is avoided. + if (System.Runtime.InteropServices.RuntimeInformation.FrameworkDescription.StartsWith(".NET Framework", StringComparison.OrdinalIgnoreCase)) + return; +#endif + RegisterCodePages(); +#endif + } +#if !NETFRAMEWORK + private static void RegisterCodePages() + { + // Register CodePagesEncodingProvider so that legacy encodings like Shift-JIS (CP932), + // GBK (CP936), Big5 (CP950), EUC-KR (CP949), etc. are available on all platforms. + // On .NET Framework, all code pages are natively available and registration is not needed. 
+ Encoding.RegisterProvider( CodePagesEncodingProvider.Instance ); + } +#endif + + /// + /// Gets the PDFsharp specific encoder RawEncoding. + /// Raw encoding allows to work with strings instead of byte arrays. + /// A raw encoded string is equivalent to a byte array of the same length + /// where each string character represents one byte. + /// Therefore, each character of a raw string has a value less than 256. + /// + public static Encoding RawEncoding => _rawEncoding ??= new RawEncoding(); static Encoding? _rawEncoding; internal static Encoding ByteStringEncoding => _rawEncoding ??= new RawEncoding(); // new name?? @@ -64,6 +86,20 @@ public static Encoding WinAnsiEncoding public static Encoding UnicodeEncoding => _unicodeEncoding ??= Encoding.Unicode; static Encoding? _unicodeEncoding; + /// + /// Gets an encoding that corresponds to the ANSI code page of the current culture. + /// + /// The returned encoding is determined by the ANSI code page associated with the current + /// thread's culture. If the current culture does not define an ANSI code page (value 0), Latin1 is used on + /// .NET 5 or later; other targets pass the value to Encoding.GetEncoding unchanged. This property is useful + /// for interoperability with legacy systems or file formats that rely on culture-specific encodings. + public static Encoding AnsiCodepageEncoding => +#if NET5_0_OR_GREATER + CultureInfo.CurrentCulture.TextInfo.ANSICodePage != 0 ? Encoding.GetEncoding( CultureInfo.CurrentCulture.TextInfo.ANSICodePage ) : Encoding.Latin1; +#else + Encoding.GetEncoding(CultureInfo.CurrentCulture.TextInfo.ANSICodePage); +#endif + ///// ///// Encodes a string from a byte array. Each character gets the code of the corresponding byte. 
///// diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/PdfSharp.csproj b/src/foundation/src/PDFsharp/src/PdfSharp/PdfSharp.csproj index f7ff7380..27a05eb7 100644 --- a/src/foundation/src/PDFsharp/src/PdfSharp/PdfSharp.csproj +++ b/src/foundation/src/PDFsharp/src/PdfSharp/PdfSharp.csproj @@ -70,6 +70,10 @@ + + + + diff --git a/src/foundation/src/PDFsharp/tests/PdfSharp.Tests/IO/LexerTests.cs b/src/foundation/src/PDFsharp/tests/PdfSharp.Tests/IO/LexerTests.cs index 690aa917..610ecdcd 100644 --- a/src/foundation/src/PDFsharp/tests/PdfSharp.Tests/IO/LexerTests.cs +++ b/src/foundation/src/PDFsharp/tests/PdfSharp.Tests/IO/LexerTests.cs @@ -258,6 +258,71 @@ public void Scan_ObjRef_Tests(string text, (int, int) objID/*, bool testAsLong, } + [Fact] + public void ScanName_UTF8_encoded_name_is_decoded_correctly() + { + // UTF-8 encoding of "日本語" is E6 97 A5 E6 9C AC E8 AA 9E. + // ScanName returns the token INCLUDING the leading '/', e.g. "/日本語". + var nameBytes = System.Text.Encoding.UTF8.GetBytes("日本語"); + var bytes = new byte[1 + nameBytes.Length + 1]; + bytes[0] = (byte)'/'; + nameBytes.CopyTo(bytes, 1); + bytes[^1] = (byte)' '; + + var lexer = CreateLexerFromBytes(bytes); + var symbol = lexer.ScanName(); + + symbol.Should().Be(Symbol.Name); + lexer.Token.Should().Be("/日本語"); + } + + [Fact] + public void ScanName_NonUTF8_bytes_do_not_throw() + { + // 0xE3 0x81 is an invalid UTF-8 sequence (incomplete 3-byte sequence). + // Regardless of platform, this must not throw and must return some string. + byte[] bytes = [(byte)'/', 0xE3, 0x81, (byte)' ']; + + var lexer = CreateLexerFromBytes(bytes); + var symbol = lexer.ScanName(); + + symbol.Should().Be(Symbol.Name); + lexer.Token.Should().NotBeNullOrEmpty(); + lexer.Token.Should().StartWith("/"); + } + +#if NET8_0_OR_GREATER + [SkippableFact] + public void ScanName_ShiftJIS_name_is_decoded_on_ShiftJIS_default_encoding() + { + // This test verifies Shift-JIS decoding when the current culture's ANSI code page is CP932. 
+ // ScanName falls back to CultureInfo.CurrentCulture.TextInfo.ANSICodePage, not to Encoding.Default + // (which is always UTF-8 on .NET Core and later), so the skip condition checks the culture's code page. + Skip.If(System.Globalization.CultureInfo.CurrentCulture.TextInfo.ANSICodePage != 932, + "Requires the current culture's ANSI code page to be 932 (Shift-JIS)."); + + // Shift-JIS encoding of "テスト": 83 65 83 58 83 67 (no PDF delimiter bytes, and not valid UTF-8). + // ScanName returns the token INCLUDING the leading '/'. + byte[] sjisBytes = [0x83, 0x65, 0x83, 0x58, 0x83, 0x67]; + byte[] bytes = new byte[1 + sjisBytes.Length + 1]; + bytes[0] = (byte)'/'; + sjisBytes.CopyTo(bytes, 1); + bytes[^1] = (byte)' '; + + var lexer = CreateLexerFromBytes(bytes); + var symbol = lexer.ScanName(); + + symbol.Should().Be(Symbol.Name); + lexer.Token.Should().Be("/テスト"); + } +#endif + + static Lexer CreateLexerFromBytes(byte[] bytes) + { + var stream = new MemoryStream(bytes); + return new Lexer(stream, null); + } + Lexer CreateLexer(string text) { var pdfString = new PdfString(text, PdfStringEncoding.RawEncoding);