Using TF-IDF and cosine similarity to build a Christmas carol search engine

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

Load the carols

In [2]:
# Read the data into a pandas DataFrame
file_name = "carols.csv"
df = pd.read_csv('./{}'.format(file_name))

Inspect the data to check we have carols

In [3]:
# First 10 carols
df.head(10)
Out[3]:
Carol Lyrics
0 Have Yourself A Merry Little Christmas Have yourself a merry little Christmas,\nLet y...
1 I'll Be Home For Christmas I'll be home for Christmas\nYou can count on m...
2 It's the most wonderful time of the year It's the most wonderful time of the year.\nWit...
3 Jingle Bell Rock Jingle bell, jingle bell, jingle bell rock\nJi...
4 Jingle Bells Dashing through the snow\nOn a one-horse open ...
5 Let It Snow! Oh, the weather outside is frightful,\nBut the...
6 O Christmas Tree O Christmas Tree,\nO Christmas Tree,\nHow stea...
7 Rocking around the Christmas Tree Rocking around the Christmas tree\nat the Chri...
8 Rudolph The Red-Nosed Reindeer You know Dasher and Dancer\nAnd Prancer and Vi...
9 Santa Claus Is Coming To Town You better watch out\nYou better not cry\nBett...

Check for and remove carols with missing values

In [4]:
# Check for missing values
missing_values = df.isnull().values.any()
if missing_values:
    display(df[df.isnull().any(axis=1)])
In [5]:
# Remove records with missing values
df.dropna(inplace=True)

Determine the term frequencies (TFs)

In [6]:
# Use a CountVectorizer to learn the terms and term frequencies across all of the documents (carols) 
cv = CountVectorizer(stop_words='english')
doc_term_matrix = cv.fit_transform(df['Lyrics'])
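
As an optional aside, the vectorizer's analyzer shows exactly how a line of lyrics is prepared before counting: lowercased, split into word tokens, and stripped of single-character tokens and English stop words. A quick sketch using the opening line of Jingle Bells:

In [ ]:
# Optional: see how a sample lyric is tokenised before counting
# (lowercased; punctuation, single-character tokens and English stop words dropped)
analyzer = cv.build_analyzer()
analyzer("Dashing through the snow on a one-horse open sleigh")
# Expected (roughly): ['dashing', 'snow', 'horse', 'open', 'sleigh']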

Perform some simple analysis

In [7]:
# Number of documents vs number of terms 
doc_term_matrix.shape
Out[7]:
(16, 550)
In [8]:
# Get the terms - unique words, excluding English stop words and single-character tokens like "a"
cv.get_feature_names()
Out[8]:
['aglow',
 'ago',
 'ain',
 'air',
 'allow',
 'appear',
 'arose',
 'ashes',
 'aside',
 'awake',
 'away',
 'bad',
 'bank',
 'bay',
 'beard',
 'beautiful',
 'bed',
 'beds',
 'befalls',
 'begun',
 'bell',
 'belling',
 'bells',
 'belly',
 'better',
 'bird',
 'birds',
 'blitzen',
 'blixem',
 'blowing',
 'bluebird',
 'bob',
 'bough',
 'boughs',
 'bound',
 'bow',
 'bowlful',
 'branches',
 'breast',
 'bright',
 'bring',
 'broad',
 'brought',
 'brown',
 'build',
 'bulb',
 'bundle',
 'bushels',
 'bye',
 'called',
 'calling',
 'came',
 'cap',
 'card',
 'care',
 'caroling',
 'carols',
 'checking',
 'cheeks',
 'cheer',
 'cherry',
 'chestnuts',
 'child',
 'children',
 'chilling',
 'chime',
 'chimney',
 'chin',
 'choir',
 'christmas',
 'christmases',
 'chubby',
 'circus',
 'clatter',
 'claus',
 'clime',
 'clock',
 'clothes',
 'clown',
 'columbus',
 'comet',
 'comfort',
 'coming',
 'conspire',
 'constant',
 'corn',
 'count',
 'couple',
 'coursers',
 'crack',
 'creature',
 'cupid',
 'danced',
 'dancer',
 'dancing',
 'dash',
 'dasher',
 'dashing',
 'day',
 'days',
 'dear',
 'deck',
 'delightful',
 'dimples',
 'doesn',
 'donner',
 'doves',
 'drawn',
 'dread',
 'dream',
 'dreaming',
 'dreams',
 'dressed',
 'drew',
 'drifted',
 'driver',
 'droll',
 'drove',
 'drummers',
 'drumming',
 'dry',
 'dunder',
 'dying',
 'eagles',
 'eighth',
 'eleventh',
 'elf',
 'encircled',
 'ere',
 'eskimo',
 'eskimos',
 'eve',
 'everybody',
 'exclaim',
 'eye',
 'eyes',
 'face',
 'faith',
 'faithful',
 'fallen',
 'famous',
 'fanny',
 'fashioned',
 'fates',
 'feeling',
 'feet',
 'fell',
 'fields',
 'fifth',
 'figgy',
 'filled',
 'finally',
 'finger',
 'flash',
 'flew',
 'flung',
 'fly',
 'foggy',
 'folks',
 'foot',
 'form',
 'fourth',
 'french',
 'friends',
 'frightful',
 'frolic',
 'frost',
 'frosty',
 'fun',
 'fur',
 'games',
 'gather',
 'gave',
 'gay',
 'geese',
 'gent',
 'gets',
 'ghost',
 'giddy',
 'girls',
 'giving',
 'glad',
 'gleams',
 'glee',
 'gliding',
 'glisten',
 'glistening',
 'glories',
 'glowing',
 'glows',
 'going',
 'golden',
 'gone',
 'gonna',
 'good',
 'goodies',
 'goodness',
 'got',
 'green',
 'ground',
 'guide',
 'halls',
 'hand',
 'hang',
 'happiness',
 'happy',
 'hard',
 'hate',
 'head',
 'heads',
 'hear',
 'heard',
 'heart',
 'hearts',
 'held',
 'help',
 'hens',
 'highest',
 'history',
 'hitch',
 'ho',
 'hold',
 'holiday',
 'holly',
 'home',
 'hoof',
 'hop',
 'hope',
 'hopes',
 'horse',
 'hosting',
 'house',
 'hung',
 'hurricane',
 'ing',
 'inspires',
 'jack',
 'jelly',
 'jerk',
 'jingle',
 'jingling',
 'job',
 'jolly',
 'joyous',
 'just',
 'kerchief',
 'kids',
 'kin',
 'knew',
 'knock',
 'know',
 'knows',
 'ladies',
 'lane',
 'lank',
 'later',
 'laugh',
 'laughed',
 'laughing',
 'lawn',
 'laying',
 'lead',
 'lean',
 'leaping',
 'leaves',
 'lend',
 'lesson',
 'let',
 'lie',
 'light',
 'lights',
 'like',
 'list',
 'listen',
 'listening',
 'little',
 'lively',
 'll',
 'loaded',
 'long',
 'looked',
 'lords',
 'lot',
 'lots',
 'love',
 'loved',
 'low',
 'lustre',
 'maids',
 'make',
 'making',
 'mamma',
 'man',
 'married',
 'marshmallows',
 'matter',
 'meadow',
 'meet',
 'merrily',
 'merry',
 'mid',
 'miles',
 'milking',
 'mingle',
 'miniature',
 'misfortune',
 'miss',
 'mister',
 'mistletoe',
 'mistletoeing',
 'mix',
 'moment',
 'monopoly',
 'moon',
 'mother',
 'mount',
 'mouse',
 'mouth',
 'names',
 'nap',
 'naughty',
 'near',
 'nestled',
 'new',
 'nice',
 'nicholas',
 'nick',
 'night',
 'ninety',
 'ninth',
 'nipping',
 'nod',
 'nose',
 'nosed',
 'objects',
 'obstacle',
 'offering',
 'oft',
 'oh',
 'old',
 'olden',
 'ones',
 'open',
 'opening',
 'outside',
 'pack',
 'parson',
 'parties',
 'partridge',
 'party',
 'pawing',
 'pear',
 'peddler',
 'phrase',
 'pick',
 'pie',
 'pinochio',
 'pipe',
 'pipers',
 'piping',
 'place',
 'plans',
 'play',
 'plump',
 'plums',
 'poor',
 'popping',
 'porch',
 'pout',
 'prancer',
 'prancing',
 'presents',
 'pretend',
 'pudding',
 'pumpkin',
 'quick',
 'quickly',
 'rapid',
 'really',
 'recall',
 'red',
 'reindeer',
 'rhyme',
 'ride',
 'riding',
 'right',
 'ring',
 'rings',
 'roasting',
 'rock',
 'rocking',
 'roof',
 'rose',
 'roses',
 'round',
 'rudolph',
 'said',
 'sake',
 'santa',
 'sash',
 'saw',
 'say',
 'scary',
 'season',
 'seated',
 'second',
 'sees',
 'sent',
 'sentimental',
 'settled',
 'seven',
 'seventh',
 'shining',
 'shiny',
 'shook',
 'shouted',
 'shutters',
 'sight',
 'signs',
 'simple',
 'sing',
 'singing',
 'sings',
 'sixth',
 'sky',
 'sleep',
 'sleeping',
 'sleigh',
 'sleighing',
 'slowly',
 'smoke',
 'snow',
 'snowing',
 'snowman',
 'snows',
 'snug',
 'song',
 'soon',
 'soot',
 'speed',
 'spirit',
 'spirits',
 'spite',
 'spoke',
 'sprang',
 'sprawling',
 'spy',
 'square',
 'st',
 'star',
 'stay',
 'steadfast',
 'stirring',
 'stockings',
 'stop',
 'stopping',
 'stories',
 'storm',
 'story',
 'straight',
 'strength',
 'stump',
 'sublime',
 'sugar',
 'summer',
 'sung',
 'swans',
 'swell',
 'swimming',
 'swing',
 'tail',
 'tailed',
 'tales',
 'tarnished',
 'teach',
 'team',
 'teeth',
 'tell',
 'telling',
 'tenth',
 'thistle',
 'thought',
 'threw',
 'thrilling',
 'tide',
 'tidings',
 'tight',
 'time',
 'times',
 'tiny',
 'toasting',
 'tonight',
 'tore',
 'tots',
 'town',
 'toys',
 'tree',
 'treetops',
 'tries',
 'troubles',
 'true',
 'turkey',
 'turned',
 'turning',
 'turtle',
 'twas',
 'twelfth',
 'twice',
 'twinkled',
 'twinkling',
 'twist',
 'unafraid',
 'upsot',
 'used',
 've',
 'visions',
 'vixen',
 'voices',
 'walking',
 'wall',
 'want',
 'warm',
 'watch',
 'way',
 'ways',
 'weather',
 'went',
 'whistle',
 'whistled',
 'white',
 'wild',
 'window',
 'wink',
 'winter',
 'wintertime',
 'wish',
 'won',
 'wonderful',
 'wondering',
 'wonderland',
 'word',
 'work',
 'wreath',
 'write',
 'year',
 'years',
 'yippee',
 'yore',
 'young',
 'yule']
In [9]:
# Check the number of terms
len(cv.get_feature_names())
Out[9]:
550
In [10]:
# View the word counts across all of the documents
word_counts = pd.DataFrame(doc_term_matrix.toarray(), index=df["Carol"], columns=cv.get_feature_names())
word_counts
Out[10]:
aglow ago ain air allow appear arose ashes aside awake ... word work wreath write year years yippee yore young yule
Carol
Have Yourself A Merry Little Christmas 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 1 0 1 0 1
I'll Be Home For Christmas 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
It's the most wonderful time of the year 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 4 0 0 0 0 0
Jingle Bell Rock 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
Jingle Bells 0 2 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
Let It Snow! 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
O Christmas Tree 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
Rocking around the Christmas Tree 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
Rudolph The Red-Nosed Reindeer 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
Santa Claus Is Coming To Town 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
The Christmas Song 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
The Night Before Christmas 0 0 0 0 0 1 1 1 1 0 ... 1 1 1 0 0 0 0 0 0 0
The Twelve Days of Christmas 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
We Wish You A Merry Christmas 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 6 0 0 0 0 0
White Christmas 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 2 0 0 0 0 0 0
Winter Wonderland 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

16 rows × 550 columns

In [11]:
# View the most and least frequent words
word_counts.sum().sort_values(ascending=False)
Out[11]:
christmas    55
jingle       31
tree         28
snow         17
ll           16
             ..
objects       1
obstacle      1
offering      1
oft           1
aglow         1
Length: 550, dtype: int64
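
Before moving on to the IDFs, it can also help to look at the document frequencies (DFs) directly: how many carols each term appears in. A small sketch using the count matrix above:

In [ ]:
# Document frequency: the number of carols in which each term appears at least once
doc_freqs = (word_counts > 0).sum().sort_values(ascending=False)
doc_freqs.head()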
In [12]:
# View the word counts for certain words
word_counts[["christmas", "jingle"]]
Out[12]:
christmas jingle
Carol
Have Yourself A Merry Little Christmas 3 0
I'll Be Home For Christmas 3 0
It's the most wonderful time of the year 0 1
Jingle Bell Rock 0 18
Jingle Bells 0 12
Let It Snow! 0 0
O Christmas Tree 13 0
Rocking around the Christmas Tree 5 0
Rudolph The Red-Nosed Reindeer 1 0
Santa Claus Is Coming To Town 0 0
The Christmas Song 1 0
The Night Before Christmas 2 0
The Twelve Days of Christmas 12 0
We Wish You A Merry Christmas 10 0
White Christmas 5 0
Winter Wonderland 0 0

Determine the inverse document frequencies (IDFs)

In [13]:
# We have the term frequencies, now determine the inverse document frequencies (IDFs)
idfs = TfidfTransformer() 
idfs.fit(doc_term_matrix)
Out[13]:
TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)
In [14]:
# Create a data frame with the IDF values 
idfs_df = pd.DataFrame(idfs.idf_, index=cv.get_feature_names(), columns=["idfs"]) 
 
# Sort descending and display
# High IDF terms appear in few of the carols (rare across the corpus); low IDF terms appear in many (common)
idfs_df.sort_values(by=['idfs'], ascending=False)
Out[14]:
idfs
aglow 3.140066
maids 3.140066
mount 3.140066
mother 3.140066
moon 3.140066
... ...
happy 2.041454
snow 1.753772
sleigh 1.753772
ll 1.753772
christmas 1.435318

550 rows × 1 columns
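
As a cross-check on those values: with the default smooth_idf=True, scikit-learn computes idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of documents and df(t) is the number of documents containing term t. A minimal sketch verifying this against the fitted transformer:

In [ ]:
# Cross-check the IDF formula used by TfidfTransformer with smooth_idf=True:
# idf(t) = ln((1 + n_docs) / (1 + df(t))) + 1
n_docs = doc_term_matrix.shape[0]
doc_freq = (doc_term_matrix > 0).sum(axis=0).A1   # df(t) for every term
manual_idfs = np.log((1 + n_docs) / (1 + doc_freq)) + 1
np.allclose(manual_idfs, idfs.idf_)               # should be True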

Put it all together to calculate the TF-IDFs

In [15]:
# We have the term frequencies and inverse document frequencies - now calculate the TF-IDF scores
tf_idfs = idfs.transform(doc_term_matrix)
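
With the other defaults (sublinear_tf=False, norm='l2'), each row of tf_idfs is simply the carol's raw term counts multiplied element-wise by the IDFs and then scaled to unit length. A minimal sketch reproducing the first carol's row by hand:

In [ ]:
# Reproduce the TF-IDF row for the first carol by hand:
# raw counts * IDFs, then L2-normalise the row
counts_row = doc_term_matrix[0].toarray().ravel()
manual_row = counts_row * idfs.idf_
manual_row = manual_row / np.linalg.norm(manual_row)
np.allclose(manual_row, tf_idfs[0].toarray().ravel())   # should be True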

Do some more analysis

In [16]:
# Create a data frame to view the TF-IDF scores for the first document, doc = 0
doc = 0
col = "tf-idf for doc {}".format(doc)
tf_idf_doc = pd.DataFrame(tf_idfs[doc].T.todense(), index=cv.get_feature_names(), columns=[col])
tf_idf_doc.sort_values(by=[col], ascending=False)
Out[16]:
tf-idf for doc 0
little 0.400025
troubles 0.306225
merry 0.298630
days 0.266683
christmas 0.209962
... ...
goodies 0.000000
good 0.000000
gonna 0.000000
gone 0.000000
aglow 0.000000

550 rows × 1 columns

In [17]:
# Create a data frame to view all of the TF-IDF scores
tf_idf_all_docs = pd.DataFrame(tf_idfs.T.todense(), index=cv.get_feature_names())
tf_idf_all_docs
Out[17]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
aglow 0.000000 0.0 0.0000 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.0 0.145192 0.0 0.0 0.0 0.0 0.000000
ago 0.000000 0.0 0.1085 0.000000 0.094446 0.0 0.0 0.0 0.000000 0.0 0.000000 0.0 0.0 0.0 0.0 0.000000
ain 0.000000 0.0 0.0000 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.0 0.000000 0.0 0.0 0.0 0.0 0.076468
air 0.000000 0.0 0.0000 0.049624 0.000000 0.0 0.0 0.0 0.000000 0.0 0.000000 0.0 0.0 0.0 0.0 0.000000
allow 0.153113 0.0 0.0000 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.0 0.000000 0.0 0.0 0.0 0.0 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
years 0.153113 0.0 0.0000 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.0 0.000000 0.0 0.0 0.0 0.0 0.000000
yippee 0.000000 0.0 0.0000 0.000000 0.000000 0.0 0.0 0.0 0.081955 0.0 0.000000 0.0 0.0 0.0 0.0 0.000000
yore 0.153113 0.0 0.0000 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.0 0.000000 0.0 0.0 0.0 0.0 0.000000
young 0.000000 0.0 0.0000 0.000000 0.054225 0.0 0.0 0.0 0.000000 0.0 0.000000 0.0 0.0 0.0 0.0 0.000000
yule 0.133342 0.0 0.0000 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.0 0.126444 0.0 0.0 0.0 0.0 0.000000

550 rows × 16 columns

In [18]:
# Nicer if we reorient the scores so they're displayed in the same way as the term frequencies at the top:
# one row per carol, one column per term (tf_idfs already has that orientation, so no transpose is needed)
tf_idf_all_docs_nicer = pd.DataFrame(tf_idfs.toarray(), index=df["Carol"], columns=cv.get_feature_names())
tf_idf_all_docs_nicer
Out[18]:
aglow ago ain air allow appear arose ashes aside awake ... word work wreath write year years yippee yore young yule
Carol
Have Yourself A Merry Little Christmas 0.000000 0.000000 0.000000 0.000000 0.153113 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.153113 0.000000 0.153113 0.000000 0.133342
I'll Be Home For Christmas 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
It's the most wonderful time of the year 0.000000 0.108500 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.434002 0.000000 0.000000 0.000000 0.000000 0.000000
Jingle Bell Rock 0.000000 0.000000 0.000000 0.049624 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
Jingle Bells 0.000000 0.094446 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.054225 0.000000
Let It Snow! 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
O Christmas Tree 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
Rocking around the Christmas Tree 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
Rudolph The Red-Nosed Reindeer 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.081955 0.000000 0.000000 0.000000
Santa Claus Is Coming To Town 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.091651 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
The Christmas Song 0.145192 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.126444
The Night Before Christmas 0.000000 0.000000 0.000000 0.000000 0.000000 0.055317 0.055317 0.055317 0.055317 0.000000 ... 0.055317 0.055317 0.055317 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
The Twelve Days of Christmas 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
We Wish You A Merry Christmas 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.280458 0.000000 0.000000 0.000000 0.000000 0.000000
White Christmas 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.268166 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
Winter Wonderland 0.000000 0.000000 0.076468 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

16 rows × 550 columns

In [19]:
# Even better, let's just display the TF-IDFs for certain words of interest
tf_idf_all_docs_nicer[["christmas", "jingle"]]
Out[19]:
christmas jingle
Carol
Have Yourself A Merry Little Christmas 0.209962 0.000000
I'll Be Home For Christmas 0.373915 0.000000
It's the most wonderful time of the year 0.000000 0.097086
Jingle Bell Rock 0.000000 0.696062
Jingle Bells 0.000000 0.507062
Let It Snow! 0.000000 0.000000
O Christmas Tree 0.472514 0.000000
Rocking around the Christmas Tree 0.329465 0.000000
Rudolph The Red-Nosed Reindeer 0.037462 0.000000
Santa Claus Is Coming To Town 0.000000 0.000000
The Christmas Song 0.066367 0.000000
The Night Before Christmas 0.050571 0.000000
The Twelve Days of Christmas 0.127103 0.000000
We Wish You A Merry Christmas 0.245341 0.000000
White Christmas 0.306446 0.000000
Winter Wonderland 0.000000 0.000000

Now prepare a search query

In [20]:
# Now let's perform a simple query that looks for the following words
query = "red rocking reindeer"

# Calculate term frequencies for the query using terms found across all of the documents
query_term_matrix = cv.transform([query])
In [21]:
# Across all of the terms, view the word counts for the query
query_counts = pd.DataFrame(query_term_matrix.toarray(), columns=cv.get_feature_names())

# Query term counts, showing all terms within the documents
# query_counts

# Query term counts, showing just the query terms (shows what we know already of course)
query_counts[query.split(" ")]
Out[21]:
red rocking reindeer
0 1 1 1

Calculate the cosine similarity between the TF-IDFs and the query words

In [22]:
# Calculate the cosine similarity between each carol's TF-IDF vector and the query's count vector
results = cosine_similarity(tf_idfs, query_term_matrix)
results
Out[22]:
array([[0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.24968423],
       [0.42648217],
       [0.        ],
       [0.06532237],
       [0.02488732],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ]])
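
Cosine similarity is just the dot product of the two vectors divided by the product of their lengths, so any single entry can be cross-checked by hand. A sketch for the highest-scoring carol (index 8, Rudolph The Red-Nosed Reindeer):

In [ ]:
# Cross-check one score by hand: cos(a, b) = a.b / (|a| * |b|)
a = tf_idfs[8].toarray().ravel()            # TF-IDF vector for carol 8
b = query_term_matrix.toarray().ravel()     # raw count vector for the query
a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))   # should match results[8] above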
In [23]:
# Flatten the scores to a 1-D array, one score per carol
results = results.reshape((-1,))
results
Out[23]:
array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.24968423, 0.42648217, 0.        ,
       0.06532237, 0.02488732, 0.        , 0.        , 0.        ,
       0.        ])
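
Note that the query vector is left as raw counts; because cosine similarity only depends on the direction of the vectors, that is enough to rank the carols. An optional variant (a sketch, not what was done above) is to pass the query through the same fitted transformer so its terms are IDF-weighted too, which down-weights any common query words:

In [ ]:
# Optional variant: IDF-weight the query as well before comparing
query_tfidf = idfs.transform(query_term_matrix)
alt_results = cosine_similarity(tf_idfs, query_tfidf).reshape((-1,))
alt_results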

Show the results

In [24]:
# Print the top search results - voila, hopefully!
# Dan's note to self:
# argsort returns the indices that would sort the array in ascending order
# Useful slice notation reference: https://stackoverflow.com/questions/509211/understanding-slice-notation 
# [:-11:-1] returns the last 10 items, in reverse order
print("Search results for: '{}'".format(query))
for i in results.argsort()[:-11:-1]:
    if results[i] > 0:
        print("Carol {}. {} {}%".format(i, df.iloc[i,0], round(100*results[i])))
Search results for: 'red rocking reindeer'
Carol 8. Rudolph The Red-Nosed Reindeer 43.0%
Carol 7. Rocking around the Christmas Tree 25.0%
Carol 10. The Christmas Song 7.0%
Carol 11. The Night Before Christmas 2.0%
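
To reuse this as a little search engine, the steps above can be bundled into one helper. A sketch assuming the fitted cv, idfs and tf_idfs (and df) from earlier are still in scope; the name search_carols is just illustrative:

In [ ]:
# Illustrative helper bundling the steps above into a reusable search function
def search_carols(query, top_n=10):
    """Return up to top_n (carol, score %) pairs, best match first."""
    query_vec = cv.transform([query])                         # query term counts
    scores = cosine_similarity(tf_idfs, query_vec).reshape((-1,))
    ranked = scores.argsort()[::-1][:top_n]                   # indices, best first
    return [(df.iloc[i, 0], round(100 * scores[i])) for i in ranked if scores[i] > 0]

search_carols("red rocking reindeer")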