In [1]:
#import dependencies
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import tensorflow_hub as tfhub
import sqlite3
from sqlite3 import Error
import os

#force output to display the full description (no column-width truncation)
#None is the supported "unlimited" value; the old -1 sentinel is deprecated since pandas 1.0
#and rejected by newer releases, so it is only used as a fallback for very old pandas versions
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
In [2]:
#connect to the database
#the path is built with os.path.join so it works on every OS and avoids the
#invalid '\w' escape sequence that the literal 'db\wine_data.sqlite' contained
conn = sqlite3.connect(os.path.join('db', 'wine_data.sqlite'))
c = conn.cursor()
In [3]:
#load every row of the wine_data table into a dataframe
wine_df = pd.read_sql(sql='Select * from wine_data', con=conn)
In [14]:
#preview the first row of the dataframe to check the columns that were loaded
wine_df.head(1)
Out[14]:
level_0 index country description rating price province title variety winery color countryID varietyID colorID provinceID wineryID
0 0 0 Portugal This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's already drinkable, although it will certainly be better from 2016. 87 15.0 Douro Quinta dos Avidagos 2011 Avidagos Red (Douro) Portuguese Red Quinta dos Avidagos red 0 0 0 0 0
In [4]:
#check the shape (rows, columns) of the dataframe
wine_df.shape
Out[4]:
(100228, 16)
In [5]:
#set the directory in which to cache the tensorflow universal sentence encoder
#NOTE(review): this is an absolute, machine-specific path - adjust it for your own environment
os.environ["TFHUB_CACHE_DIR"] = 'C:/Users/bendgame/Downloads'
In [15]:
#Run the line below once to download the Universal Sentence Encoder onto the local machine.
#If the model cannot be downloaded to the local machine, use this web path in place of the local path in the cells below.
#embed = tfhub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
In [ ]:
#embed = tfhub.Module("C:/Users/Administrator/Downloads/1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47")
In [9]:
#create the tensorflow graph once, using a string placeholder as the encoder input, so the same
#embedding op can be fed different text at run time. per the original author this gives huge
#performance improvements by reducing the overhead and time it takes to embed the wine descriptions.
g=tf.Graph()
with g.as_default():
    #1-D batch of strings (shape [None]) that is supplied through feed_dict
    text_input = tf.placeholder(dtype = tf.string, shape=[None])
    #load the encoder module from a local path
    embed = tfhub.Module("C:/Users/bendgame/Downloads/1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47")
    #op that applies the module to whatever is fed into text_input
    em_txt = embed(text_input)
    #single op that groups the global-variables and tables initializers
    init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
#g.finalize()

#open a session on graph g and run the initializers
session = tf.Session(graph = g)
session.run(init_op)
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
I0803 22:15:37.622714 14160 saver.py:1483] Saver not created because there are no variables in the graph to restore
In [10]:
#feed every wine description through the encoder op to obtain the embeddings
result = session.run(em_txt, feed_dict={text_input: wine_df['description'].tolist()})
In [16]:
#recommendation function: scores the wines in wine_df against a free-text query
def recommend_engine(query, color, embedding_table = result):
    '''
    Score every wine in wine_df against a user query and return them best-first.

    The query is embedded with the module-level encoder op and session, and the
    dot product (calculated using numpy) between the query embedding and each
    row of embedding_table is used as the similarity score.

    Parameters
    ----------
    query : str
        Free-text description of the wine the user is looking for.
    color : str
        'red', 'white' or 'other' restricts the results to wines of that color;
        any other value returns wines of every color.
    embedding_table : numpy.ndarray
        Embedded descriptions (defaults to result). Assumed to hold one row per
        row of wine_df, since the scores are assigned as a wine_df column.

    Returns
    -------
    pandas.DataFrame
        Columns variety, title, price, description, recommendation, rating and
        color, sorted by recommendation (the similarity score), highest first.
    '''

    # Embed the user query through the existing placeholder op using the
    # module-level session that was already initialized. The previous version
    # opened a new session, created new initializer ops and called
    # embed([query]) on every call, which added ops to graph g each time and
    # repeated the initialization work.
    embedding = session.run(em_txt, feed_dict={text_input: [query]})

    # Calculate similarity with all reviews
    similarity_score = np.dot(embedding, embedding_table.T)

    #used to calculate the cosine similarity
    #     norm_a = np.linalg.norm(embedding)
    #     norm_b = np.linalg.norm(embedding_table.T)
    #     similarity_score = dot/(norm_a * norm_b)

    recommendations = wine_df.copy()
    # one query gives a single row of scores; flatten it into a 1-D column
    recommendations['recommendation'] = np.ravel(similarity_score)
    recommendations = recommendations.sort_values('recommendation', ascending=False)

    #filter through the dataframe to find the corresponding wine color records;
    #any other color value leaves the wines of all colors in place
    if color in ('red', 'white', 'other'):
        recommendations = recommendations.loc[recommendations.color == color]

    #returns dataframe restricted to the columns of interest
    return recommendations[['variety', 'title', 'price', 'description', 'recommendation'
                            , 'rating','color']]
In [18]:
#example: ask for red wines that match a free-text taste description
query = "fruity, rich, easy to drink, sweet"
color = 'red'

recommendation = recommend_engine(query, color)
print(query)

#show the three highest-scoring wines, transposed so that each wine is a column
recommendation.head(3).T
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
I0803 22:45:12.076908 14160 saver.py:1483] Saver not created because there are no variables in the graph to restore
fruity, rich, easy to drink, sweet
Out[18]:
24956 28115 18222
variety Syrah Cabernet Sauvignon Cabernet Sauvignon
title Dierberg 2008 Syrah (Santa Ynez Valley) Middle Sister NV Mischief Maker Cabernet Sauvignon (California) Kirkland Signature 2009 Cabernet Sauvignon (Alexander Valley)
price 34 12 9
description A nice Syrah, rich in berries, licorice, Dr. Pepper cola and spices, with soft, luxurious tannins. It's a little uncomplicated and sweet, but easy to drink now. With an aroma like cherry cola, and fruity, almost sweet flavors, this medium-bodied wine is easy to drink. Fruity in blackberry and cherry flavors, this is easy to drink, with a good acid-tannin balance.
recommendation 0.808436 0.805091 0.782239
rating 86 85 84
color red red red
In [ ]: