diff --git a/src/Directory.Packages.props b/src/Directory.Packages.props
index dcb63a2f..4360e29f 100644
--- a/src/Directory.Packages.props
+++ b/src/Directory.Packages.props
@@ -37,6 +37,7 @@
+
diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Content/CLexer.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Content/CLexer.cs
index 85102697..84162976 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Content/CLexer.cs
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Content/CLexer.cs
@@ -234,20 +234,28 @@ static char LogError(char ch)
}
var name = Token;
- // Check token for UTF-8 encoding.
+ // Check for non-ASCII bytes that may indicate a multi-byte or legacy ANSI encoding.
for (int idx = 0; idx < name.Length; idx++)
{
- // If the two top most significant bits are set this identifies a 2, 3, or 4
- // byte UTF-8 encoding sequence.
- if ((name[idx] & 0xC0) == 0xC0)
+ if ((name[idx] & 0x80) != 0)
{
- // Special characters in Name objects use UTF-8 encoding.
+ // Special characters in Name objects may use UTF-8 or a legacy ANSI encoding.
var length = name.Length;
var bytes = new byte[length];
for (int idx2 = 0; idx2 < length; idx2++)
bytes[idx2] = (byte)name[idx2];
- var decodedName = Encoding.UTF8.GetString(bytes);
+ string decodedName;
+ try
+ {
+ // Try strict UTF-8 first; throws DecoderFallbackException on invalid sequences.
+ decodedName = StrictUtf8.GetString(bytes);
+ }
+ catch (DecoderFallbackException)
+ {
+ // Fallback to ANSI code page encoding if UTF-8 decoding fails.
+ decodedName = PdfEncoders.AnsiCodepageEncoding.GetString( bytes );
+ }
_token.Clear();
_token.Append(decodedName);
break;
@@ -498,6 +506,7 @@ public CSymbol ScanNumber()
return Symbol = CSymbol.Real; // CLexer returns "Real" because there is no "LongInteger".
}
+ static readonly UTF8Encoding StrictUtf8 = new(false, true);
static readonly double[] PowersOf10 = [1, 10, 100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000, 100_000_000, 1_000_000_000, 10_000_000_000];
///
diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs
index 738b49dd..d9954651 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs
@@ -247,20 +247,28 @@ static char LogError(char ch)
}
var name = Token;
- // Check for UTF-8 encoding.
+ // Check for non-ASCII bytes that may indicate a multi-byte or legacy ANSI encoding.
for (int idx = 0; idx < name.Length; idx++)
{
- // If the two top most significant bits are set this identifies a 2, 3, or 4
- // byte UTF-8 encoding sequence.
- if ((name[idx] & 0xC0) == 0xC0)
+ if ((name[idx] & 0x80) != 0)
{
- // Special characters in Name objects use UTF-8 encoding.
+ // Special characters in Name objects may use UTF-8 or a legacy ANSI encoding.
var length = name.Length;
var bytes = new byte[length];
for (int idx2 = 0; idx2 < length; idx2++)
bytes[idx2] = (byte)name[idx2];
- var decodedName = Encoding.UTF8.GetString(bytes);
+ string decodedName;
+ try
+ {
+ // Try strict UTF-8 first; throws DecoderFallbackException on invalid sequences.
+ decodedName = StrictUtf8.GetString(bytes);
+ }
+ catch (DecoderFallbackException)
+ {
+ // Fallback to ANSI code page encoding if UTF-8 decoding fails.
+ decodedName = PdfEncoders.AnsiCodepageEncoding.GetString( bytes );
+ }
_token.Clear();
_token.Append(decodedName);
break;
@@ -558,6 +566,7 @@ int TryReadReference()
}
}
+ static readonly UTF8Encoding StrictUtf8 = new(false, true);
static readonly double[] PowersOf10 = [1, 10, 100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000, 100_000_000, 1_000_000_000, 10_000_000_000];
///
diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Internal/PdfEncoders.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Internal/PdfEncoders.cs
index 7a6245b4..7925d680 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Internal/PdfEncoders.cs
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Internal/PdfEncoders.cs
@@ -17,14 +17,36 @@ namespace PdfSharp.Pdf.Internal
///
public static class PdfEncoders
{
- ///
- /// Gets the PDFsharp specific encoder RawEncoding.
- /// Ray encoding allows wo work with string instead of byte array.
- /// A raw encoded string is equivalent to a byte array of the same length
- /// where each sting character represents one byte.
- /// Therefore, each character of a raw string has a value less than 256.
- ///
- public static Encoding RawEncoding => _rawEncoding ??= new RawEncoding();
+ static PdfEncoders()
+ {
+#if !NETFRAMEWORK
+#if NETSTANDARD
+ // A netstandard2.0 build can also run on .NET Framework, so the runtime is checked at run time.
+ // .NET Framework does not need CodePagesEncodingProvider and the component is not present there, so the call itself is avoided.
+ if (System.Runtime.InteropServices.RuntimeInformation.FrameworkDescription.StartsWith(".NET Framework", StringComparison.OrdinalIgnoreCase))
+ return;
+#endif
+ RegisterCodePages();
+#endif
+ }
+#if !NETFRAMEWORK
+ private static void RegisterCodePages()
+ {
+ // Registers CodePagesEncodingProvider so that legacy encodings like Shift-JIS (CP932), GBK (CP936),
+ // Big5 (CP950), EUC-KR (CP949), etc. are available on all platforms.
+ // Excluded from NETFRAMEWORK builds: on .NET Framework all code pages are natively available and registration is not needed.
+ Encoding.RegisterProvider( CodePagesEncodingProvider.Instance );
+ }
+#endif
+
+ ///
+ /// Gets the PDFsharp specific encoder RawEncoding.
+ /// Raw encoding allows working with strings instead of byte arrays.
+ /// A raw encoded string is equivalent to a byte array of the same length
+ /// where each string character represents one byte.
+ /// Therefore, each character of a raw string has a value less than 256.
+ ///
+ public static Encoding RawEncoding => _rawEncoding ??= new RawEncoding();
static Encoding? _rawEncoding;
internal static Encoding ByteStringEncoding => _rawEncoding ??= new RawEncoding(); // new name??
@@ -64,6 +86,20 @@ public static Encoding WinAnsiEncoding
public static Encoding UnicodeEncoding => _unicodeEncoding ??= Encoding.Unicode;
static Encoding? _unicodeEncoding;
+ ///
+ /// Gets an encoding that corresponds to the ANSI code page of the current culture.
+ ///
+ /// The returned encoding is determined by the ANSI code page associated with the current
+ /// thread's culture. If the current culture does not define an ANSI code page, a default encoding such as
+ /// Latin1 may be used. This property is useful for interoperability with legacy systems or file formats that
+ /// rely on culture-specific encodings.
+ public static Encoding AnsiCodepageEncoding =>
+#if NET5_0_OR_GREATER
+ CultureInfo.CurrentCulture.TextInfo.ANSICodePage != 0 ? Encoding.GetEncoding( CultureInfo.CurrentCulture.TextInfo.ANSICodePage ) : Encoding.Latin1; // Code page 0 falls back to Latin1 on these targets.
+#else
+ Encoding.GetEncoding(CultureInfo.CurrentCulture.TextInfo.ANSICodePage); // No zero check here: the code page is passed to GetEncoding unchanged.
+#endif
+
/////
///// Encodes a string from a byte array. Each character gets the code of the corresponding byte.
/////
diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/PdfSharp.csproj b/src/foundation/src/PDFsharp/src/PdfSharp/PdfSharp.csproj
index f7ff7380..27a05eb7 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/PdfSharp.csproj
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/PdfSharp.csproj
@@ -70,6 +70,10 @@
+
+
+
+
diff --git a/src/foundation/src/PDFsharp/tests/PdfSharp.Tests/IO/LexerTests.cs b/src/foundation/src/PDFsharp/tests/PdfSharp.Tests/IO/LexerTests.cs
index 690aa917..610ecdcd 100644
--- a/src/foundation/src/PDFsharp/tests/PdfSharp.Tests/IO/LexerTests.cs
+++ b/src/foundation/src/PDFsharp/tests/PdfSharp.Tests/IO/LexerTests.cs
@@ -258,6 +258,71 @@ public void Scan_ObjRef_Tests(string text, (int, int) objID/*, bool testAsLong,
}
+ [Fact]
+ public void ScanName_UTF8_encoded_name_is_decoded_correctly()
+ {
+ // Arrange: the input is '/' followed by the UTF-8 bytes of "日本語" and a single space.
+ // UTF-8 encoding of "日本語" is E6 97 A5 E6 9C AC E8 AA 9E.
+ byte[] encodedName = System.Text.Encoding.UTF8.GetBytes("日本語");
+ byte[] input = [(byte)'/', .. encodedName, (byte)' '];
+
+ // Act
+ var lexer = CreateLexerFromBytes(input);
+ var scanned = lexer.ScanName();
+
+ // Assert: ScanName returns the token INCLUDING the leading '/'.
+ // The multi-byte sequences must come back as the original three characters.
+ scanned.Should().Be(Symbol.Name);
+ lexer.Token.Should().Be("/日本語");
+ }
+
+ [Fact]
+ public void ScanName_NonUTF8_bytes_do_not_throw()
+ {
+ // E3 81 begins a 3-byte UTF-8 sequence whose last byte is missing, so it is not valid UTF-8.
+ // Whichever encoding the platform falls back to, scanning must not throw
+ // and must still produce a non-empty token that starts with '/'.
+ byte[] input = [(byte)'/', 0xE3, 0x81, (byte)' '];
+
+ var lexer = CreateLexerFromBytes(input);
+ var scanned = lexer.ScanName();
+
+ scanned.Should().Be(Symbol.Name);
+ lexer.Token.Should().NotBeNullOrEmpty().And.StartWith("/");
+ }
+
+#if NET8_0_OR_GREATER
+ [SkippableFact]
+ public void ScanName_ShiftJIS_name_is_decoded_on_ShiftJIS_default_encoding()
+ {
+ // The lexer falls back to the ANSI code page of the current culture, so that is what must be CP932.
+ // Encoding.Default is always UTF-8 (65001) on .NET 5+, so testing it would skip this test everywhere.
+ Skip.If(System.Globalization.CultureInfo.CurrentCulture.TextInfo.ANSICodePage != 932,
+ "Requires the current culture's ANSI code page to be 932 (Shift-JIS).");
+
+ // Shift-JIS encoding of "日語": 93 FA 8C EA. These bytes are not valid UTF-8, which forces the fallback.
+ // "本" is deliberately left out: its Shift-JIS form is 96 7B, and 0x7B is '{', a PDF delimiter character.
+ // ScanName returns the token INCLUDING the leading '/'.
+ byte[] sjisBytes = [0x93, 0xFA, 0x8C, 0xEA];
+ byte[] bytes = new byte[1 + sjisBytes.Length + 1];
+ bytes[0] = (byte)'/';
+ sjisBytes.CopyTo(bytes, 1);
+ bytes[^1] = (byte)' ';
+
+ var lexer = CreateLexerFromBytes(bytes);
+ var symbol = lexer.ScanName();
+
+ symbol.Should().Be(Symbol.Name);
+ lexer.Token.Should().Be("/日語");
+ }
+#endif
+
+ static Lexer CreateLexerFromBytes(byte[] bytes)
+ {
+ // Wraps the raw bytes in an in-memory stream; the second constructor argument is passed as null.
+ return new Lexer(new MemoryStream(bytes), null);
+ }
+
Lexer CreateLexer(string text)
{
var pdfString = new PdfString(text, PdfStringEncoding.RawEncoding);