"""Test the pyzor.digest module."""

import sys
import hashlib
import re
import unittest

# `re` is imported explicitly above; the tests call re.compile() and should not
# depend on the star import below happening to re-export the module.
from pyzor.digest import *

# NOTE(review): as it appears here this fixture contains no HTML tags, while
# HTML_TEXT_STRIPED below has separators (" , ", " ( UBE )") that look like
# former tag boundaries.  The markup may have been lost in transit -- confirm
# against the repository copy before relying on test_HTMLStripper.
HTML_TEXT = """Email spam

Email spam, also known as junk email or unsolicited bulk email (UBE), is a subset of electronic spam involving nearly identical messages sent to numerous recipients by email. Clicking on links in spam email may send users to phishing web sites or sites that are hosting malware."""

# Expected result of joining (with single spaces) the chunks that HTMLStripper
# collects from HTML_TEXT.
HTML_TEXT_STRIPED = (
    'Email spam Email spam , also known as junk email or unsolicited bulk email ( UBE ),'
    ' is a subset of electronic spam involving nearly identical messages sent to numerous recipients by email'
    ' . Clicking on links in spam email may send users to phishing web sites or sites that are hosting malware .'
)


class HTMLStripperTests(unittest.TestCase):
    """Check the text chunks HTMLStripper collects from a document."""

    def setUp(self):
        unittest.TestCase.setUp(self)
        # Collector list handed to HTMLStripper; the test reads it afterwards.
        self.data = []

    def tearDown(self):
        unittest.TestCase.tearDown(self)

    def test_HTMLStripper(self):
        """Feed HTML_TEXT to the stripper and compare the joined chunks."""
        stripper = HTMLStripper(self.data)
        stripper.feed(HTML_TEXT)
        res = " ".join(self.data)
        self.assertEqual(res, HTML_TEXT_STRIPED)


class PreDigestTests(unittest.TestCase):
    """Check which normalized lines DataDigester passes to handle_line.

    setUp replaces DataDigester.digest_payloads and DataDigester.handle_line
    with mocks so that every handled line is recorded (decoded as UTF-8) in
    self.lines; tearDown puts the saved attributes back.
    """

    def setUp(self):
        unittest.TestCase.setUp(self)
        self.lines = []

        def mock_digest_payloads(_digester, message):
            # Yield the raw message as one decoded payload.
            yield message.decode("utf8")

        def mock_handle_line(_digester, line):
            # `self` is the enclosing test case, not the digester instance.
            self.lines.append(line.decode("utf8"))

        self.real_digest_payloads = DataDigester.digest_payloads
        self.real_handle_line = DataDigester.handle_line
        DataDigester.digest_payloads = mock_digest_payloads
        DataDigester.handle_line = mock_handle_line

    def tearDown(self):
        unittest.TestCase.tearDown(self)
        DataDigester.digest_payloads = self.real_digest_payloads
        DataDigester.handle_line = self.real_handle_line

    def _relax_longstr_pattern(self):
        """Make longstr_ptrn match only runs of 100+ non-space characters.

        The original pattern is restored by a cleanup when the test ends,
        whether it passed or failed.  Presumably this keeps the long-word
        filter from pre-empting the filter under test -- TODO confirm.
        """
        self.addCleanup(setattr, DataDigester, "longstr_ptrn",
                        DataDigester.longstr_ptrn)
        DataDigester.longstr_ptrn = re.compile(r'\S{100,}')

    def _assert_token_removed(self, tokens):
        """Digest "Test <token> Test2" per token; expect "TestTest2" first."""
        message = "Test %s Test2"
        expected = "TestTest2"
        for token in tokens:
            self.lines = []
            DataDigester((message % token).encode("utf8"))
            self.assertEqual(self.lines[0], expected)

    def test_predigest_emails(self):
        """Test email removal in the predigest process"""
        self._relax_longstr_pattern()
        self._assert_token_removed([
            "test@example.com",
            "test123@example.com",
            "test+abc@example.com",
            "test.test2@example.com",
            "test.test2+abc@example.com",
        ])

    # XXX This fails
    # def test_predigest_emails_whitespace(self):
    #     real_longstr = DataDigester.longstr_ptrn
    #     DataDigester.longstr_ptrn = re.compile(r'\S{100,}')
    #     emails = ["chirila@example. com",
    #               "chirila@example . com",
    #               "chirila @example. com",
    #               "chirila@ example. com",
    #               "chirila @example . com",
    #               "chirila @ example. com",
    #               "chirila @ example . com",]
    #     message = "Test %s Test2"
    #     expected = "TestTest2"
    #     try:
    #         for email in emails:
    #             self.lines = []
    #             DataDigester(message % email)
    #             self.assertEqual(self.lines[0], expected)
    #     finally:
    #         DataDigester.longstr_ptrn = real_longstr

    def test_predigest_urls(self):
        """Test url removal in the predigest process"""
        self._relax_longstr_pattern()
        self._assert_token_removed([
            "http://www.example.com",
            # "www.example.com", # XXX This also fail
            "http://example.com",
            # "example.com", # XXX This also fails
            # The comma after the next entry used to be missing, which
            # silently concatenated the last two URLs into one string.
            "http://www.example.com/test/",
            "http://www.example.com/test/test2",
        ])

    def test_predigest_long(self):
        """Test long "words" removal in the predigest process"""
        # Runs with the real longstr_ptrn, unlike the email/url tests.
        self._assert_token_removed(["0A2D3f%a#S", "3sddkf9jdkd9", "@@#@@@@@@@@@"])

    def test_predigest_min_line_lenght(self):
        """Test small lines removal in the predigest process"""
        message = ("This line is included\n"
                   "not this\n"
                   "This also")
        expected = ["Thislineisincluded", "Thisalso"]
        DataDigester(message.encode("utf8"))
        self.assertEqual(self.lines, expected)

    def test_predigest_atomic(self):
        """Test atomic messages (lines <= 4) in the predigest process"""
        message = "All this message\nShould be included\nIn the predigest"
        expected = ["Allthismessage", "Shouldbeincluded", "Inthepredigest"]
        DataDigester(message.encode("utf8"))
        self.assertEqual(self.lines, expected)

    def test_predigest_pieced(self):
        """Test pieced messages (lines > 4) in the predigest process"""
        message = "".join("Line%d test test test\n" % i for i in range(100))
        # Only these line numbers of the 100-line message are expected to be
        # handed to handle_line.
        expected = ["Line%dtesttesttest" % i for i in (20, 21, 22, 60, 61, 62)]
        DataDigester(message.encode("utf8"))
        self.assertEqual(self.lines, expected)


class DigestTests(unittest.TestCase):
    """Check the final digest value with only digest_payloads mocked."""

    def setUp(self):
        unittest.TestCase.setUp(self)
        self.lines = []

        def mock_digest_payloads(_digester, message):
            # Yield the raw message as one decoded payload.
            yield message.decode("utf8")

        self.real_digest_payloads = DataDigester.digest_payloads
        DataDigester.digest_payloads = mock_digest_payloads

    def tearDown(self):
        unittest.TestCase.tearDown(self)
        DataDigester.digest_payloads = self.real_digest_payloads

    def test_digest(self):
        """The digest value is the SHA-1 hex digest of the predigested text."""
        message = b"That's some good ham right there"
        predigested = b"That'ssomegoodhamrightthere"

        expected = hashlib.sha1(predigested).hexdigest()

        result = DataDigester(message).value
        self.assertEqual(result, expected)


def suite():
    """Gather all the tests from this module in a test suite."""
    # unittest.makeSuite() is deprecated since Python 3.11 and removed in
    # 3.13; the default loader builds the same per-class suites.
    loader = unittest.defaultTestLoader
    test_suite = unittest.TestSuite()
    for case in (HTMLStripperTests, PreDigestTests, DigestTests):
        test_suite.addTest(loader.loadTestsFromTestCase(case))
    return test_suite


if __name__ == '__main__':
    unittest.main()