From 2d52c17cabed4811e4eac46af2cfe775d46d540b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20R=C3=B6tttgermann?= Date: Sat, 6 Jun 2026 00:41:37 +0200 Subject: [PATCH 1/8] enhanced codeblock regex and made stricter requirements for valid codeblocks --- bot/exts/info/codeblock/_parsing.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bot/exts/info/codeblock/_parsing.py b/bot/exts/info/codeblock/_parsing.py index abad09eef1..0408692295 100644 --- a/bot/exts/info/codeblock/_parsing.py +++ b/bot/exts/info/codeblock/_parsing.py @@ -35,11 +35,11 @@ fr""" (?P<ticks> (?P<tick>[{''.join(_TICKS)}]) # Put all ticks into a character class within a group. - \2{{2}} # Match previous group 2 more times to ensure the same char. + \2* # Match previous group up to N more times to ensure the same char. ) - (?P<lang>[A-Za-z0-9\+\-\.]+\n)? # Optionally match a language specifier followed by a newline. + (?P<lang>[A-Za-z0-9+\-.]+\s)? # Optionally match a language specifier followed by a whitespace. (?P<code>.+?) # Match the actual code within the block. - \1 # Match the same 3 ticks used at the start of the block. + \1 # Match the same N ticks used at the start of the block. """, re.DOTALL | re.VERBOSE ) @@ -86,9 +86,9 @@ def find_code_blocks(message: str) -> Sequence[CodeBlock] | None: for match in _RE_CODE_BLOCK.finditer(message): # Used to ensure non-matched groups have an empty string as the default value. groups = match.groupdict("") - language = groups["lang"].strip() # Strip the newline cause it's included in the group. + language = groups["lang"].strip() # Strip the whitespace cause it's included in the group. 
- if groups["tick"] == BACKTICK and language: + if groups["tick"] == BACKTICK and len(groups["ticks"]) == 3 and language and ("\n" in groups["lang"]): log.trace("Message has a valid code block with a language; returning None.") return None if has_lines(groups["code"], constants.CodeBlock.minimum_lines): From e23f230c85899ae40e7db2311c8447112cd136f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20R=C3=B6tttgermann?= Date: Sat, 6 Jun 2026 00:52:40 +0200 Subject: [PATCH 2/8] added empty newline for readability --- bot/exts/info/codeblock/_parsing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bot/exts/info/codeblock/_parsing.py b/bot/exts/info/codeblock/_parsing.py index 0408692295..e19934ba7a 100644 --- a/bot/exts/info/codeblock/_parsing.py +++ b/bot/exts/info/codeblock/_parsing.py @@ -91,6 +91,7 @@ def find_code_blocks(message: str) -> Sequence[CodeBlock] | None: if groups["tick"] == BACKTICK and len(groups["ticks"]) == 3 and language and ("\n" in groups["lang"]): log.trace("Message has a valid code block with a language; returning None.") return None + if has_lines(groups["code"], constants.CodeBlock.minimum_lines): code_block = CodeBlock(groups["code"], language, groups["tick"]) code_blocks.append(code_block) From 72a988b976eb3fa3f1a3fb2a8e1ddb25e6f5a7aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20R=C3=B6tttgermann?= Date: Sat, 6 Jun 2026 00:53:41 +0200 Subject: [PATCH 3/8] minor code cleanup --- bot/exts/info/codeblock/_parsing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bot/exts/info/codeblock/_parsing.py b/bot/exts/info/codeblock/_parsing.py index e19934ba7a..0569b1433e 100644 --- a/bot/exts/info/codeblock/_parsing.py +++ b/bot/exts/info/codeblock/_parsing.py @@ -182,7 +182,7 @@ def parse_bad_language(content: str) -> BadLanguage | None: ) -def _get_leading_spaces(content: str) -> int: +def _get_leading_spaces(content: str) -> int | None: """Return the number of spaces at the start of the first line in `content`.""" 
leading_spaces = 0 for char in content: @@ -190,6 +190,7 @@ def _get_leading_spaces(content: str) -> int: leading_spaces += 1 else: return leading_spaces + return None From 9b9bdb12d14db84f33adeef9c8bdb2fbfa364cf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20R=C3=B6tttgermann?= Date: Sun, 7 Jun 2026 00:32:51 +0200 Subject: [PATCH 4/8] added unit tests for the codeblock detection --- tests/bot/exts/info/codeblock/__init__.py | 0 tests/bot/exts/info/codeblock/test_parsing.py | 153 ++++++++++++++++++ 2 files changed, 153 insertions(+) create mode 100644 tests/bot/exts/info/codeblock/__init__.py create mode 100644 tests/bot/exts/info/codeblock/test_parsing.py diff --git a/tests/bot/exts/info/codeblock/__init__.py b/tests/bot/exts/info/codeblock/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/bot/exts/info/codeblock/test_parsing.py b/tests/bot/exts/info/codeblock/test_parsing.py new file mode 100644 index 0000000000..53353fadbd --- /dev/null +++ b/tests/bot/exts/info/codeblock/test_parsing.py @@ -0,0 +1,153 @@ +import unittest + +from bot.exts.info.codeblock import _parsing as parsing + + +class FindFaultyCodeblocksTest(unittest.TestCase): + def test_should_recognize_missing_language(self): + message = """``` +x = 4 +y = 2 +print("abc") +```""" + faulty_code_blocks = parsing.find_faulty_code_blocks(message) + self.assertIsNotNone(faulty_code_blocks) + self.assertEqual(len(faulty_code_blocks), 1) + + def test_should_recognize_contained_codeblock(self): + message = """' +wouldn't it be easier to do: +```py +say_hi = lambda: + print('hello') + print('world') +say_hi() + +' +``` + +'""" + faulty_code_blocks = parsing.find_faulty_code_blocks(message) + self.assertIsNone(faulty_code_blocks) + + def test_should_recognize_contained_codeblock_even_if_that_breaks_formatting(self): + message = """``` + ```py + x = 4 + y = 3 + z = 2 + print("abc") + ``` + ``` + """ + faulty_code_blocks = parsing.find_faulty_code_blocks(message) + 
self.assertIsNone(faulty_code_blocks) + + def test_should_not_recognize_normal_single_quotes(self): + """normal single quotes refers to single quotes that appear normally in text, + like for example in "I'll", "We're", etc.""" + message = """I'm writing line 1 + and we're writing line 2 + we'll also be checking another of those + and some odd 'variations + isn't it beautiful?""" + + faulty_code_blocks = parsing.find_faulty_code_blocks(message) + self.assertIsNotNone(faulty_code_blocks) + self.assertEqual(len(faulty_code_blocks), 0) + + def test_should_not_recognize_quoting_single_quotes(self): + message = """ 'I am doing a long quote. + Sure, I could just use the > character + for correct quoting + but whatever... + End of quote' """ + + faulty_code_blocks = parsing.find_faulty_code_blocks(message) + self.assertIsNotNone(faulty_code_blocks) + self.assertEqual(len(faulty_code_blocks), 0) + + + def test_should_not_recognize_normal_double_quotes(self): + """normal double quotes refer to double quotes that appear normally in text to quote something""" + message = """ "I am doing a long quote. + Sure, I could just use the > character + for correct quoting + but whatever... + End of quote" """ + + faulty_code_blocks = parsing.find_faulty_code_blocks(message) + self.assertIsNotNone(faulty_code_blocks) + self.assertEqual(len(faulty_code_blocks), 0) + + def test_should_not_recognize_normal_double_quotes_python_text(self): + message = """ "python is a great language + great + great + great language + enough lines? 
+ yes" """ + + faulty_code_blocks = parsing.find_faulty_code_blocks(message) + self.assertIsNotNone(faulty_code_blocks) + self.assertEqual(len(faulty_code_blocks), 0) + + def test_should_recognize_single_backtick_no_language(self): + message = """` + x = 4 + y = 3 + z = 2 + print("abc") + `""" + + faulty_code_blocks = parsing.find_faulty_code_blocks(message) + self.assertIsNotNone(faulty_code_blocks) + self.assertEqual(len(faulty_code_blocks), 1) + + def test_should_recognize_single_backtick_with_language(self): + message = """`py + x = 4 + y = 3 + z = 2 + print("abc") + `""" + + faulty_code_blocks = parsing.find_faulty_code_blocks(message) + self.assertIsNotNone(faulty_code_blocks) + self.assertEqual(len(faulty_code_blocks), 1) + + def test_should_recognize_single_single_quote_with_py_language(self): + message = """'py + x = 4 + y = 3 + z = 2 + print("abc") + '""" + + faulty_code_blocks = parsing.find_faulty_code_blocks(message) + self.assertIsNotNone(faulty_code_blocks) + self.assertEqual(len(faulty_code_blocks), 1) + + def test_should_recognize_single_single_quote_with_python_language(self): + message = """'python + x = 4 + y = 3 + z = 2 + print("abc") + '""" + + faulty_code_blocks = parsing.find_faulty_code_blocks(message) + self.assertIsNotNone(faulty_code_blocks) + self.assertEqual(len(faulty_code_blocks), 1) + + def test_should_recognize_wrong_number_of_backticks(self): + message = """``py + x = 4 + y = 3 + z = 2 + print("abc") + ``""" + + faulty_code_blocks = parsing.find_faulty_code_blocks(message) + self.assertIsNotNone(faulty_code_blocks) + self.assertEqual(len(faulty_code_blocks), 1) From f272a86b7f449fe02a7b8a51550b30bc86ac2195 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20R=C3=B6tttgermann?= Date: Sun, 7 Jun 2026 00:35:28 +0200 Subject: [PATCH 5/8] made faulty codeblock detection more restrictive and improved bad ticks error message --- bot/exts/info/codeblock/_instructions.py | 4 ++-- bot/exts/info/codeblock/_parsing.py | 25 
++++++++++++++++++------ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/bot/exts/info/codeblock/_instructions.py b/bot/exts/info/codeblock/_instructions.py index 210217ccaf..e855717aab 100644 --- a/bot/exts/info/codeblock/_instructions.py +++ b/bot/exts/info/codeblock/_instructions.py @@ -38,7 +38,7 @@ def _get_bad_ticks_message(code_block: _parsing.CodeBlock) -> str | None: valid_ticks = f"\\{_parsing.BACKTICK}" * 3 instructions = ( "You are using the wrong character instead of backticks. " - f"Use {valid_ticks}, not `{code_block.tick * 3}`." + f"Use {valid_ticks}, not `{code_block.ticks}`." ) log.trace("Check if the bad ticks code block also has issues with the language specifier.") @@ -138,7 +138,7 @@ def get_instructions(content: str) -> str | None: """ log.trace("Getting formatting instructions.") - blocks = _parsing.find_code_blocks(content) + blocks = _parsing.find_faulty_code_blocks(content) if blocks is None: log.trace("At least one valid code block found; no instructions to return.") return None diff --git a/bot/exts/info/codeblock/_parsing.py b/bot/exts/info/codeblock/_parsing.py index 0569b1433e..bac6b862f7 100644 --- a/bot/exts/info/codeblock/_parsing.py +++ b/bot/exts/info/codeblock/_parsing.py @@ -6,6 +6,8 @@ from collections.abc import Sequence from typing import NamedTuple +import regex + from bot import constants from bot.log import get_logger from bot.utils import has_lines @@ -33,6 +35,8 @@ _RE_CODE_BLOCK = re.compile( fr""" + (?:^| # the ticks need to start at the front of a line to be recognized + \s) # or need to have a preceeding whitespace (to avoid detection of words like I'll). (?P<ticks> (?P<tick>[{''.join(_TICKS)}]) # Put all ticks into a character class within a group. \2* # Match previous group up to N more times to ensure the same char. 
@@ -43,6 +47,7 @@ """, re.DOTALL | re.VERBOSE ) +_RE_CODE_BLOCK_REGEX = regex.compile(_RE_CODE_BLOCK.pattern, regex.DOTALL | regex.VERBOSE) _RE_LANGUAGE = re.compile( fr""" @@ -59,6 +64,7 @@ class CodeBlock(NamedTuple): content: str language: str + ticks: str tick: str @@ -70,9 +76,9 @@ class BadLanguage(NamedTuple): has_terminal_newline: bool -def find_code_blocks(message: str) -> Sequence[CodeBlock] | None: +def find_faulty_code_blocks(message: str) -> Sequence[CodeBlock] | None: """ - Find and return all Markdown code blocks in the `message`. + Find and return all faulty Markdown code blocks in the `message`. Code blocks with 3 or fewer lines are excluded. @@ -83,7 +89,7 @@ def find_code_blocks(message: str) -> Sequence[CodeBlock] | None: log.trace("Finding all code blocks in a message.") code_blocks = [] - for match in _RE_CODE_BLOCK.finditer(message): + for match in _RE_CODE_BLOCK_REGEX.finditer(message, overlapped=True): # Used to ensure non-matched groups have an empty string as the default value. groups = match.groupdict("") language = groups["lang"].strip() # Strip the whitespace cause it's included in the group. 
@@ -92,11 +98,18 @@ def find_code_blocks(message: str) -> Sequence[CodeBlock] | None: log.trace("Message has a valid code block with a language; returning None.") return None - if has_lines(groups["code"], constants.CodeBlock.minimum_lines): - code_block = CodeBlock(groups["code"], language, groups["tick"]) + if not has_lines(groups["code"], constants.CodeBlock.minimum_lines): + log.trace("Skipped a code block shorter than 4 lines.") + continue + + if (groups["tick"] == BACKTICK + or (language in PY_LANG_CODES and is_python_code(groups["code"])) + or len(groups["ticks"]) >= 2): + log.trace("Message has an invalid code block.") + code_block = CodeBlock(groups["code"], language, groups["ticks"], groups["tick"]) code_blocks.append(code_block) else: - log.trace("Skipped a code block shorter than 4 lines.") + log.trace("Skipped invalid code block due to uncertainty if it is supposed to be a code block.") return code_blocks From eb4a021b8ef01292fe6269abc95e3bb7076bfbc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20R=C3=B6tttgermann?= Date: Sun, 7 Jun 2026 00:50:18 +0200 Subject: [PATCH 6/8] improved string formatting in the tests --- tests/bot/exts/info/codeblock/test_parsing.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/bot/exts/info/codeblock/test_parsing.py b/tests/bot/exts/info/codeblock/test_parsing.py index 53353fadbd..4507fcaa54 100644 --- a/tests/bot/exts/info/codeblock/test_parsing.py +++ b/tests/bot/exts/info/codeblock/test_parsing.py @@ -6,27 +6,27 @@ class FindFaultyCodeblocksTest(unittest.TestCase): def test_should_recognize_missing_language(self): message = """``` -x = 4 -y = 2 -print("abc") -```""" + x = 4 + y = 2 + print("abc") + ```""" faulty_code_blocks = parsing.find_faulty_code_blocks(message) self.assertIsNotNone(faulty_code_blocks) self.assertEqual(len(faulty_code_blocks), 1) def test_should_recognize_contained_codeblock(self): message = """' -wouldn't it be easier to do: -```py -say_hi = 
lambda: - print('hello') - print('world') -say_hi() + wouldn't it be easier to do: + ```py + say_hi = lambda: + print('hello') + print('world') + say_hi() -' -``` + ' + ``` -'""" + '""" faulty_code_blocks = parsing.find_faulty_code_blocks(message) self.assertIsNone(faulty_code_blocks) From d92d178c7ec8cf1f5bc52a7d852335e66b277437 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20R=C3=B6ttgermann?= <38440557+SamuelRoettgermann@users.noreply.github.com> Date: Sun, 7 Jun 2026 01:08:46 +0200 Subject: [PATCH 7/8] Update bot/exts/info/codeblock/_parsing.py Update regex comment Co-authored-by: Joe Banks --- bot/exts/info/codeblock/_parsing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/codeblock/_parsing.py b/bot/exts/info/codeblock/_parsing.py index bac6b862f7..c6f1ef2de2 100644 --- a/bot/exts/info/codeblock/_parsing.py +++ b/bot/exts/info/codeblock/_parsing.py @@ -36,7 +36,7 @@ _RE_CODE_BLOCK = re.compile( fr""" (?:^| # the ticks need to start at the front of a line to be recognized - \s) # or need to have a preceeding whitespace (to avoid detection of words like I'll). + \s) # or need to have a preceding whitespace (to avoid detection of words like I'll). (?P<ticks> (?P<tick>[{''.join(_TICKS)}]) # Put all ticks into a character class within a group. \2* # Match previous group up to N more times to ensure the same char. 
From 2ade4a3d56c86affa4c186b8a7c1ce7c169806f1 Mon Sep 17 00:00:00 2001 From: Samuel Date: Tue, 9 Jun 2026 15:34:33 +0200 Subject: [PATCH 8/8] added is_python field for code blocks; added comment explaining regex duplication --- bot/exts/info/codeblock/_instructions.py | 8 ++++---- bot/exts/info/codeblock/_parsing.py | 11 +++++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/bot/exts/info/codeblock/_instructions.py b/bot/exts/info/codeblock/_instructions.py index e855717aab..7204dbeae4 100644 --- a/bot/exts/info/codeblock/_instructions.py +++ b/bot/exts/info/codeblock/_instructions.py @@ -44,7 +44,7 @@ def _get_bad_ticks_message(code_block: _parsing.CodeBlock) -> str | None: log.trace("Check if the bad ticks code block also has issues with the language specifier.") addition_msg = _get_bad_lang_message(code_block.content) if not addition_msg and not code_block.language: - addition_msg = _get_no_lang_message(code_block.content) + addition_msg = _get_no_lang_message(code_block) # Combine the back ticks message with the language specifier message. The latter will # already have an example code block. @@ -112,7 +112,7 @@ def _get_bad_lang_message(content: str) -> str | None: return None -def _get_no_lang_message(content: str) -> str | None: +def _get_no_lang_message(code_block: _parsing.CodeBlock) -> str | None: """ Return instructions on specifying a language for a code block. @@ -120,7 +120,7 @@ def _get_no_lang_message(content: str) -> str | None: """ log.trace("Creating instructions for a missing language.") - if _parsing.is_python_code(content): + if code_block.is_python: example_blocks = _get_example("py") # Note that _get_bad_ticks_message expects the first line to have two newlines. @@ -160,6 +160,6 @@ def get_instructions(content: str) -> str | None: # Check for a bad language first to avoid parsing content into an AST. 
instructions = _get_bad_lang_message(block.content) if not instructions: - instructions = _get_no_lang_message(block.content) + instructions = _get_no_lang_message(block) return instructions diff --git a/bot/exts/info/codeblock/_parsing.py b/bot/exts/info/codeblock/_parsing.py index c6f1ef2de2..22cbe4344e 100644 --- a/bot/exts/info/codeblock/_parsing.py +++ b/bot/exts/info/codeblock/_parsing.py @@ -47,7 +47,8 @@ """, re.DOTALL | re.VERBOSE ) -_RE_CODE_BLOCK_REGEX = regex.compile(_RE_CODE_BLOCK.pattern, regex.DOTALL | regex.VERBOSE) +# copy of _RE_CODE_BLOCK. Done like this for highlighting reasons (regex.compile doesn't properly highlight) +_REGEX_CODE_BLOCK = regex.compile(_RE_CODE_BLOCK.pattern, regex.DOTALL | regex.VERBOSE) _RE_LANGUAGE = re.compile( fr""" @@ -66,6 +67,7 @@ class CodeBlock(NamedTuple): language: str ticks: str tick: str + is_python: bool class BadLanguage(NamedTuple): @@ -89,7 +91,7 @@ def find_faulty_code_blocks(message: str) -> Sequence[CodeBlock] | None: log.trace("Finding all code blocks in a message.") code_blocks = [] - for match in _RE_CODE_BLOCK_REGEX.finditer(message, overlapped=True): + for match in _REGEX_CODE_BLOCK.finditer(message, overlapped=True): # Used to ensure non-matched groups have an empty string as the default value. groups = match.groupdict("") language = groups["lang"].strip() # Strip the whitespace cause it's included in the group. 
@@ -102,11 +104,12 @@ def find_faulty_code_blocks(message: str) -> Sequence[CodeBlock] | None: log.trace("Skipped a code block shorter than 4 lines.") continue + is_python = is_python_code(groups["code"]) if (groups["tick"] == BACKTICK - or (language in PY_LANG_CODES and is_python_code(groups["code"])) + or (language in PY_LANG_CODES and is_python) or len(groups["ticks"]) >= 2): log.trace("Message has an invalid code block.") - code_block = CodeBlock(groups["code"], language, groups["ticks"], groups["tick"]) + code_block = CodeBlock(groups["code"], language, groups["ticks"], groups["tick"], is_python) code_blocks.append(code_block) else: log.trace("Skipped invalid code block due to uncertainty if it is supposed to be a code block.")