To make all of our hard work easier to use, we need to pack it all up into a single, neat function, as shown:
# Rebuild a character-level CountVectorizer that counts single characters and
# adjacent character pairs, keeping upper- and lower-case letters distinct.
two_cv = CountVectorizer(
    analyzer='char',
    ngram_range=(1, 2),
    lowercase=False,
)
# Learn the vocabulary from the password corpus and vectorize it in one pass.
two_char = two_cv.fit_transform(text)
# Echo the matrix so the interactive session prints its shape and sparsity.
two_char
# there are 7,528 unique 1- and 2-character sequences (the number of columns)
<1048485x7528 sparse matrix of type '<type 'numpy.int64'>' with 14350326 stored elements in Compressed Sparse Row format>
# make a simple function using the two_char CV and matrix
def get_closest_word_similarity(password, top_n=20):
    """Return the mean cosine similarity between a password and its closest matches.

    The password is vectorized with the module-level ``two_cv`` vectorizer and
    compared against every row of the module-level ``two_char`` matrix. The
    ``top_n`` highest similarities are averaged, so a value near 1 means the
    password looks very much like passwords already in the corpus.

    Args:
        password: The candidate password string to score.
        top_n: How many of the closest corpus entries to average over.
            Defaults to 20. If the corpus has fewer rows than ``top_n``,
            all rows are used.

    Returns:
        The mean of the ``top_n`` largest cosine similarities, as a float.

    Raises:
        ValueError: If ``top_n`` is smaller than 1.
    """
    # A slice of [-0:] would select every score instead of none, silently
    # averaging over the whole corpus, so reject non-positive values up front.
    if top_n < 1:
        raise ValueError("top_n must be a positive integer, got %r" % (top_n,))
    # One row of scores: the candidate compared with each row of two_char.
    similarities = cosine_similarity(two_cv.transform([password]), two_char)
    # Ascending sort puts the best matches last; sorting the values directly
    # yields the same numbers as argsort followed by fancy indexing.
    closest = np.sort(similarities, axis=1)[:, -top_n:]
    return closest.mean()
This function makes it easier to judge passwords quickly:
print get_closest_word_similarity("guest123") # very close to passwords in the db
0.789113817
print get_closest_word_similarity("sdfFSKSJNDFKFSD3253245sadSDF@@$@#$") # not very close to passwords in the db
0.47148393
We can take this one step further and create a custom password-tester class that will store in-memory vectorizations of passwords to make our algorithm easy to share:
# this is a complete data-driven automated password strength tester that judges passwords without any human intuition.
class PasswordTester(object):
    """Data-driven password strength tester.

    A candidate password is scored by how closely it resembles the passwords
    in a reference corpus: the closer the match, the weaker the password.

    Typical use::

        tester = PasswordTester(text)
        tester.make_vectorizer(ngram_range=(1, 2), analyzer='char', lowercase=False)
        tester.judge_password("password123321")

    Attributes:
        text: The reference corpus handed to the vectorizer when it is fitted.
        vectorizer: The fitted ``CountVectorizer``; ``None`` until
            ``make_vectorizer`` has been called.
        password_matrix: The vectorized corpus; ``None`` until
            ``make_vectorizer`` has been called.
    """

    # (lower bound, verdict) pairs, checked from strictest to most lenient.
    # A score must be strictly greater than the bound to earn the verdict;
    # anything at or below the last bound is rated "very good".
    _VERDICTS = (
        (.9, "very poor"),
        (.8, "poor"),
        (.6, "not bad"),
        (.4, "good"),
    )

    def __init__(self, text):
        """Store the reference corpus; no vectorization happens yet.

        Args:
            text: The corpus of known passwords to compare candidates against.
        """
        self.vectorizer = None
        self.password_matrix = None
        self.text = text

    def make_vectorizer(self, **kwargs):
        """Build a ``CountVectorizer`` and vectorize the stored corpus.

        Args:
            **kwargs: Passed straight through to ``CountVectorizer``.
        """
        self.vectorizer = CountVectorizer(**kwargs)
        self.password_matrix = self.vectorizer.fit_transform(self.text)

    def get_closest_word_similarity(self, password, top_n=20):
        """Return the mean cosine similarity to the closest corpus passwords.

        Args:
            password: The candidate password string to score.
            top_n: How many of the closest corpus entries to average over.
                Defaults to 20. If the corpus has fewer rows than ``top_n``,
                all rows are used.

        Returns:
            The mean of the ``top_n`` largest cosine similarities, as a float.

        Raises:
            RuntimeError: If ``make_vectorizer`` has not been called yet.
            ValueError: If ``top_n`` is smaller than 1.
        """
        # Fail with an actionable message instead of an opaque
        # "'NoneType' object has no attribute 'transform'".
        if self.vectorizer is None or self.password_matrix is None:
            raise RuntimeError(
                "call make_vectorizer() before scoring passwords")
        # A slice of [-0:] would select every score instead of none, silently
        # averaging over the whole corpus, so reject non-positive values.
        if top_n < 1:
            raise ValueError(
                "top_n must be a positive integer, got %r" % (top_n,))
        # One row of scores: the candidate compared with each corpus password.
        similarities = cosine_similarity(
            self.vectorizer.transform([password]), self.password_matrix)
        # Ascending sort puts the best matches last; sorting the values
        # directly yields the same numbers as argsort plus fancy indexing.
        closest = np.sort(similarities, axis=1)[:, -top_n:]
        return closest.mean()

    def judge_password(self, attempted_password):
        """Rate a password from "very poor" to "very good".

        Args:
            attempted_password: The candidate password string to judge.

        Returns:
            A ``(verdict, badness_score)`` tuple, where ``badness_score`` is
            the value returned by ``get_closest_word_similarity`` and a higher
            score means a worse password.

        Raises:
            RuntimeError: If ``make_vectorizer`` has not been called yet.
        """
        badness_score = self.get_closest_word_similarity(attempted_password)
        for lower_bound, verdict in self._VERDICTS:
            if badness_score > lower_bound:
                return verdict, badness_score
        return "very good", badness_score
To use our custom class, we can instantiate it with custom vectorization parameters:
p = PasswordTester(text)
p.make_vectorizer(ngram_range=(1, 2), analyzer='char', lowercase=False)
p.judge_password("password123321")
('poor', 0.8624222257655552)
p.judge_password("Istanbul9999")
('not bad', 0.7928432151071905)
# a 10-character password generated by LastPass, a password management and creation service
p.judge_password("D9GLRyG0*!")
('good', 0.41329460236856164)
# a 100-character password generated by LastPass
p.judge_password("ES%9G1UxtoBlwn^e&Bz3bAj2hMfk!2cfj8kF8yUc&J2B&khzNpBoe65Va!*XGXH1&PF5fxbKGpBsvPNQdnmnWyzb@W$tcn^%fnKa")
('very good', 0.3628996523892102)