In [65]:
#import dependencies
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import tensorflow_hub as tfhub
import sqlite3
from sqlite3 import Error

pd.set_option('display.max_colwidth', -1)
In [ ]:
#read the csv
wine_data = pd.read_csv("winemag-data-130k-v2.csv")
In [66]:
#create a sqlite database file and a connection
conn = sqlite3.connect('db\wine_data.sqlite')
c = conn.cursor()
In [ ]:
#creat a table in the database
wine_data.to_sql('wine_data', conn, if_exists = "replace")
#read the table in the database
wine_df = pd.read_sql('Select * from wine_data', conn)
In [ ]:
#Drop the duplicate descriptions
wine_df = wine_df.drop_duplicates('description')

#drop null prices
wine_df = wine_df.dropna(subset=['price'])

#filter the dataframe to include only varieties with more than 200 reviews
wine_df = wine_df.groupby('variety').filter(lambda x: len(x) > 200)

#create a column named color
wine_df["color"] = ""

#update the database
wine_df.to_sql('wine_data', conn, if_exists = "replace")
In [ ]:
pd.read_sql('SELECT variety, count(variety) as vcount FROM wine_data group by variety', conn)
In [ ]:
#used to update the database with the wine color. Maunally updated each wine variety.
c.execute("update wine_data set color = 'red' where variety = 'Aglianico'  ")

#commit the update to the database so it saves
conn.commit()
In [ ]:
#verify data saved as expected
pd.read_sql("select color, count(color) from wine_data group by color" ,conn)
In [ ]:
#remove all the records without a color
wine_df = pd.read_sql("select country, description,rating,price,province,title,variety,winery, color  from wine_data where color in ('red', 'white', 'other')", conn)
wine_df.to_sql('wine_data', conn, if_exists = "replace")
wine_df.info()
In [ ]:
#drop the rest of the NA values
wine_df = wine_df.dropna()
wine_df.info()
In [ ]:
#update the database
wine_df.to_sql('wine_data', conn, if_exists = "replace")
In [68]:
#verify the data
wine_df.head(2)
Out[68]:
index country description rating price province title variety winery color
0 0 Portugal This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's already drinkable, although it will certainly be better from 2016. 87 15.0 Douro Quinta dos Avidagos 2011 Avidagos Red (Douro) Portuguese Red Quinta dos Avidagos red
1 1 US Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through, with crisp acidity underscoring the flavors. The wine was all stainless-steel fermented. 87 14.0 Oregon Rainstorm 2013 Pinot Gris (Willamette Valley) Pinot Gris Rainstorm white
In [69]:
#create the IDs to be used as numeric labels in the machine learning models
country_df = pd.read_sql('select distinct country from wine_data', conn)
country_df.reset_index(inplace = True)
country_df =country_df.rename(columns = {'index':'countryID',
                            'country':'country'})
country_df.head()
Out[69]:
countryID country
0 0 Portugal
1 1 US
2 2 France
3 3 Germany
4 4 Argentina
In [70]:
#merge the dataframes
wine_df = wine_df.merge(country_df, how = 'inner', on ='country')
In [71]:
#create the IDs to be used as numeric labels in the machine learning models
variety_df = pd.read_sql('select distinct variety from wine_data', conn)
variety_df.reset_index(inplace = True)
variety_df =variety_df.rename(columns = {'index':'varietyID',
                            'variety':'variety'})
variety_df.head()
wine_df = wine_df.merge(variety_df, how = 'inner', on ='variety')
In [72]:
#create the IDs to be used as numeric labels in the machine learning models
color_df = pd.read_sql('select distinct color from wine_data', conn)
color_df.reset_index(inplace = True)
color_df =color_df.rename(columns = {'index':'colorID',
                            'color':'color'})
color_df.head()
wine_df = wine_df.merge(color_df, how = 'inner', on ='color')
In [73]:
#create the IDs to be used as numeric labels in the machine learning models
province_df = pd.read_sql('select distinct province from wine_data', conn)
province_df.reset_index(inplace = True)
province_df =province_df.rename(columns = {'index':'provinceID',
                            'province':'province'})
province_df.head()
wine_df = wine_df.merge(province_df, how = 'inner', on ='province')
In [75]:
#create the IDs to be used as numeric labels in the machine learning models
winery_df = pd.read_sql('select distinct winery from wine_data', conn)
winery_df.reset_index(inplace = True)
winery_df =winery_df.rename(columns = {'index':'wineryID',
                            'winery':'winery'})
winery_df.head()
wine_df = wine_df.merge(winery_df, how = 'inner', on ='winery')
In [76]:
#verify all IDs have been created
wine_df.tail(2)
Out[76]:
index country description rating price province title variety winery color countryID varietyID colorID provinceID wineryID
100226 85115 US Light-salmon in color, this Hawaiian sparkler surprises with ripe raspberry and white chocolate aromas. Vigorous bubbles clean up the strawberries-and-cream palate, enticing the drinker for yet another sip. Enjoy with creamy strawberry cheesecake. 87 28.0 Hawaii Ulupalakua NV Lokelani Sparkling Rosé Sparkling (Hawaii) Sparkling Blend Ulupalakua other 1 43 2 358 14227
100227 43274 US A full-flavored but light-bodied sparkler that has some toasty elements over lemon-tangerine fruit. The long finish brings some attractive pink grapefruit notes. A very welcome surprise from this Rhode Island winery. 90 25.0 Rhode Island Sakonnet 1995 Samson Brut (Southeastern New England) Champagne Blend Sakonnet other 1 21 2 322 11123
In [77]:
#update the dataframe with the new columns
wine_df.to_sql('wine_data', conn, if_exists = "replace")
In [78]:
#check the info
wine_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 100228 entries, 0 to 100227
Data columns (total 15 columns):
index          100228 non-null int64
country        100228 non-null object
description    100228 non-null object
rating         100228 non-null int64
price          100228 non-null float64
province       100228 non-null object
title          100228 non-null object
variety        100228 non-null object
winery         100228 non-null object
color          100228 non-null object
countryID      100228 non-null int64
varietyID      100228 non-null int64
colorID        100228 non-null int64
provinceID     100228 non-null int64
wineryID       100228 non-null int64
dtypes: float64(1), int64(7), object(7)
memory usage: 12.2+ MB
In [ ]: