Files
2021-05-24 22:18:33 +03:00

201 lines
7.5 KiB
Python

"""The the pyzor.digest module
"""
import hashlib
import re
import sys
import unittest

from pyzor.digest import *
# Sample HTML document that HTMLStripperTests feeds to HTMLStripper.
HTML_TEXT = """<html><head><title>Email spam</title></head><body>
<p><b>Email spam</b>, also known as <b>junk email</b>
or <b>unsolicited bulk email</b> (<i>UBE</i>), is a subset of
<a href="/wiki/Spam_(electronic)" title="Spam (electronic)">electronic spam</a>
involving nearly identical messages sent to numerous recipients by <a href="/wiki/Email" title="Email">
email</a>. Clicking on <a href="/wiki/Html_email#Security_vulnerabilities" title="Html email" class="mw-redirect">
links in spam email</a> may send users to <a href="/wiki/Phishing" title="Phishing">phishing</a>
web sites or sites that are hosting <a href="/wiki/Malware" title="Malware">malware</a>.</body></html>"""
# Text expected after stripping HTML_TEXT: the fragments collected by the
# stripper, joined with single spaces (see HTMLStripperTests).
HTML_TEXT_STRIPED = 'Email spam Email spam , also known as junk email or unsolicited bulk email ( UBE ),'\
' is a subset of electronic spam involving nearly identical messages sent to numerous recipients by email'\
' . Clicking on links in spam email may send users to phishing web sites or sites that are hosting malware .'
class HTMLStripperTests(unittest.TestCase):
    """Check that HTMLStripper reduces an HTML document to its text."""

    def setUp(self):
        unittest.TestCase.setUp(self)
        # Collector list handed to HTMLStripper; the test joins its
        # contents once the document has been fed.
        self.data = []

    def tearDown(self):
        unittest.TestCase.tearDown(self)

    def test_HTMLStripper(self):
        """Feed HTML_TEXT and compare the collected text fragments."""
        HTMLStripper(self.data).feed(HTML_TEXT)
        self.assertEqual(" ".join(self.data), HTML_TEXT_STRIPED)
class PreDigestTests(unittest.TestCase):
    """Check the text normalisation DataDigester performs before hashing.

    For each test, ``DataDigester.digest_payloads`` and
    ``DataDigester.handle_line`` are swapped for stand-ins: the first yields
    the raw message decoded as UTF-8, the second records every line it is
    given in ``self.lines``.  The originals are restored in ``tearDown``.
    """

    def setUp(self):
        unittest.TestCase.setUp(self)
        self.lines = []

        def mock_digest_payloads(c, message):
            yield message.decode("utf8")

        def mock_handle_line(s, line):
            # Looks up ``self.lines`` at call time, so tests may rebind it.
            self.lines.append(line.decode("utf8"))

        self.real_digest_payloads = DataDigester.digest_payloads
        self.real_handle_line = DataDigester.handle_line
        DataDigester.digest_payloads = mock_digest_payloads
        DataDigester.handle_line = mock_handle_line

    def tearDown(self):
        unittest.TestCase.tearDown(self)
        DataDigester.digest_payloads = self.real_digest_payloads
        DataDigester.handle_line = self.real_handle_line

    def _relax_longstr_ptrn(self):
        """Only treat runs of 100+ non-space characters as "long" strings.

        The original pattern is put back by a cleanup once the test ends.
        NOTE(review): presumably needed so that the long-"word" removal
        (see test_predigest_long) does not strip the fragment before the
        pattern under test gets a chance to - confirm against
        DataDigester.normalize.
        """
        real_longstr = DataDigester.longstr_ptrn
        DataDigester.longstr_ptrn = re.compile(r'\S{100,}')
        self.addCleanup(setattr, DataDigester, "longstr_ptrn", real_longstr)

    def _assert_stripped(self, fragments):
        """Assert every fragment vanishes from ``"Test %s Test2"``.

        Each fragment is digested on its own; the first recorded line must
        be the template with the fragment and all whitespace removed.
        """
        message = "Test %s Test2"
        expected = "TestTest2"
        for fragment in fragments:
            self.lines = []
            DataDigester((message % fragment).encode("utf8"))
            self.assertEqual(self.lines[0], expected)

    def test_predigest_emails(self):
        """Test email removal in the predigest process"""
        self._relax_longstr_ptrn()
        self._assert_stripped([
            "test@example.com",
            "test123@example.com",
            "test+abc@example.com",
            "test.test2@example.com",
            "test.test2+abc@example.com",
        ])

    # XXX This fails
    # def test_predigest_emails_whitespace(self):
    #     real_longstr = DataDigester.longstr_ptrn
    #     DataDigester.longstr_ptrn = re.compile(r'\S{100,}')
    #     emails = ["chirila@example. com",
    #               "chirila@example . com",
    #               "chirila @example. com",
    #               "chirila@ example. com",
    #               "chirila @example . com",
    #               "chirila @ example. com",
    #               "chirila @ example . com",]
    #     message = "Test %s Test2"
    #     expected = "TestTest2"
    #     try:
    #         for email in emails:
    #             self.lines = []
    #             DataDigester(message % email)
    #             self.assertEqual(self.lines[0], expected)
    #     finally:
    #         DataDigester.longstr_ptrn = real_longstr

    def test_predigest_urls(self):
        """Test url removal in the predigest process"""
        self._relax_longstr_ptrn()
        self._assert_stripped([
            "http://www.example.com",
            # "www.example.com", # XXX This also fail
            "http://example.com",
            # "example.com", # XXX This also fails
            # The comma after the next entry matters: without it the two
            # literals are concatenated into a single URL.
            "http://www.example.com/test/",
            "http://www.example.com/test/test2",
        ])

    def test_predigest_long(self):
        """Test long "words" removal in the predigest process"""
        self._assert_stripped([
            "0A2D3f%a#S",
            "3sddkf9jdkd9",
            "@@#@@@@@@@@@",
        ])

    def test_predigest_min_line_lenght(self):
        """Test small lines removal in the predigest process"""
        message = "This line is included\n"\
                  "not this\n"\
                  "This also"
        expected = ["Thislineisincluded", "Thisalso"]
        DataDigester(message.encode("utf8"))
        self.assertEqual(self.lines, expected)

    def test_predigest_atomic(self):
        """Test atomic messages (lines <= 4) in the predigest process"""
        message = "All this message\nShould be included\nIn the predigest"
        expected = ["Allthismessage", "Shouldbeincluded", "Inthepredigest"]
        DataDigester(message.encode("utf8"))
        self.assertEqual(self.lines, expected)

    def test_predigest_pieced(self):
        """Test pieced messages (lines > 4) in the predigest process"""
        message = "".join("Line%d test test test\n" % i for i in range(100))
        # Of the 100 input lines, only the three starting at index 20 and
        # the three starting at index 60 are expected to be handled.
        expected = ["Line%dtesttesttest" % i
                    for i in (20, 21, 22, 60, 61, 62)]
        DataDigester(message.encode("utf8"))
        self.assertEqual(self.lines, expected)
class DigestTests(unittest.TestCase):
    """Check the final digest value computed by DataDigester.

    ``DataDigester.digest_payloads`` is replaced during each test by a
    stand-in that yields the raw message decoded as UTF-8; the original is
    restored in ``tearDown``.
    """

    def setUp(self):
        unittest.TestCase.setUp(self)
        self.lines = []

        def fake_digest_payloads(c, message):
            yield message.decode("utf8")

        self.real_digest_payloads = DataDigester.digest_payloads
        DataDigester.digest_payloads = fake_digest_payloads

    def tearDown(self):
        unittest.TestCase.tearDown(self)
        DataDigester.digest_payloads = self.real_digest_payloads

    def test_digest(self):
        """The digest must equal the SHA-1 of the whitespace-free message."""
        message = b"That's some good ham right there"
        predigested = b"That'ssomegoodhamrightthere"
        expected = hashlib.sha1(predigested).hexdigest()
        self.assertEqual(DataDigester(message).value, expected)
def suite():
    """Gather all the tests from this module in a test suite.

    Returns:
        A ``unittest.TestSuite`` holding the tests of HTMLStripperTests,
        PreDigestTests and DigestTests, in that order.
    """
    # unittest.makeSuite() is deprecated since Python 3.11 and removed in
    # 3.13; the default loader collects the same "test*" methods.
    loader = unittest.defaultTestLoader
    test_suite = unittest.TestSuite()
    for case in (HTMLStripperTests, PreDigestTests, DigestTests):
        test_suite.addTest(loader.loadTestsFromTestCase(case))
    return test_suite
# Run every test in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()