This project studies data from coffeereview.com, which was obtained through web scraping and published on kaggle.com. The goal is to develop a linear regression model that predicts individual ratings for coffee beans. Additional questions answered about the data include:
Project Sections:
Data source: Link
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
The CSV files coffee_df and coffee_analysis are two different versions of the scraped data. In order to use features from both datasets, I merge them on the shared `desc_1` description column, then drop that column and any rows with missing values.
coffee_columns = ['acid', 'body', 'flavor', 'aftertaste', 'desc_1']
analysis_columns = ['loc_country', 'roast', '100g_USD', 'rating', 'desc_1']
coffee_df = pd.read_csv('data/coffee_df.csv', usecols=coffee_columns)
analysis_df = pd.read_csv('data/coffee_analysis.csv', usecols=analysis_columns)
df = pd.merge(coffee_df, analysis_df, on='desc_1', how='inner')
df.drop(['desc_1'], axis=1, inplace=True)
df.dropna(inplace=True)
df['roast'] = df['roast'].str.replace('-', '_')
dtypes = {
'acid': int,
'body': int,
'flavor': int,
'aftertaste': int,
'roast': 'category',
'loc_country': 'category',
'100g_USD': float,
'rating': int,
}
df = df.astype(dtypes)
print("Dimensions", df.shape)
print('Data Types: \n', df.dtypes)
df.head(10)
Dimensions (1779, 8)
Data Types:
acid             int64
body             int64
flavor           int64
aftertaste       int64
roast         category
loc_country   category
100g_USD       float64
rating           int64
dtype: object
| | acid | body | flavor | aftertaste | roast | loc_country | 100g_USD | rating |
|---|---|---|---|---|---|---|---|---|
0 | 9 | 9 | 9 | 8 | Medium_Light | United States | 12.93 | 94 |
1 | 9 | 9 | 9 | 8 | Medium_Light | United States | 6.17 | 93 |
2 | 9 | 8 | 9 | 8 | Medium_Light | United States | 5.58 | 92 |
3 | 8 | 8 | 9 | 8 | Medium_Light | United States | 9.17 | 92 |
4 | 8 | 9 | 9 | 8 | Medium_Light | Taiwan | 8.80 | 92 |
5 | 8 | 8 | 9 | 8 | Light | Taiwan | 6.08 | 92 |
6 | 9 | 8 | 9 | 7 | Medium_Light | United States | 5.88 | 91 |
7 | 9 | 8 | 9 | 7 | Medium_Light | United States | 5.88 | 91 |
8 | 9 | 9 | 9 | 9 | Light | United States | 13.23 | 95 |
9 | 9 | 9 | 9 | 8 | Light | United States | 8.11 | 94 |
Each row in the dataset represents an individual user's rating for a coffee bean; here is a brief description of each feature:

- acid, body, flavor, aftertaste: sub-scores on a roughly 0 to 10 scale
- roast: roast level, from Light to Dark
- loc_country: location country listed for the coffee
- 100g_USD: price in US dollars per 100 grams
- rating: overall score on a 100-point scale
Since the `roast` feature is an ordinal data type, its categories need to be put in order. Ordered table for the `roast` feature:
| Roast | Level |
|---------------|-------|
| Light | 1 |
| Medium-Light | 2 |
| Medium | 3 |
| Medium-Dark | 4 |
| Dark | 5 |
roast_types = ['Light', 'Medium_Light', 'Medium', 'Medium_Dark', 'Dark']
df['roast'] = pd.Categorical(df['roast'], categories=roast_types, ordered=True)
Now that the dataset is clean, all of its features can be analyzed. This analysis will determine which features can be used in the linear regression model. Specifically, I will compute Pearson correlations, print summary statistics, plot the feature distributions, and create box plot comparisons.
I will first start by printing some statistics about the distributions of numerical features.
description = df.describe().loc[['mean', 'std', 'min', 'max']]
skewness = df.skew(numeric_only=True).rename('skew')
description = pd.concat([description, skewness.to_frame().T])
description
| | acid | body | flavor | aftertaste | 100g_USD | rating |
|---|---|---|---|---|---|---|
mean | 8.516020 | 8.637437 | 8.981450 | 8.110736 | 9.522327 | 93.100056 |
std | 0.547154 | 0.496980 | 0.327230 | 0.479727 | 10.935001 | 1.578859 |
min | 6.000000 | 7.000000 | 7.000000 | 6.000000 | 0.170000 | 84.000000 |
max | 10.000000 | 10.000000 | 10.000000 | 9.000000 | 132.280000 | 98.000000 |
skew | -0.475956 | -0.736499 | -0.745512 | 0.237914 | 5.416931 | -0.947961 |
This table shows that `100g_USD` is highly positively skewed, and that `acid`, `body`, and `flavor` have similar means and standard deviations. Next, I will plot the distributions of these same features using histograms.
ordinal_features = ['acid', 'body', 'flavor', 'aftertaste', '100g_USD', 'rating']
features_bins = [4, 4, 4, 5, 15, 15]
fig, axs = plt.subplots(nrows=6, ncols=1, figsize=(8, 25))
fig.subplots_adjust(hspace=0.25)
for i, column in enumerate(ordinal_features):
axs[i].hist(df[column], bins=features_bins[i], color='blue', alpha=0.7)
axs[i].set_title(f'Distribution of {column}')
axs[i].set_ylabel('Frequency')
axs[i].set_xlabel(column)
plt.savefig("images/hist.svg")
Looking at the 6 histogram plots: the feature 100g_USD appears to be exponentially distributed, while the other 5 distributions appear to center around the mean. The rating feature appears to be the closest to a normal distribution. Next, I will visualize the numerical features using a box plot comparison, where the distributions for 5 numerical features are plotted against the range of rating feature values.
fig, axes = plt.subplots(nrows=5, ncols=1, figsize=(8, 25), sharex=True)
for ax, feature in zip(axes, ordinal_features[:-1]):
sns.boxplot(x='rating', y=feature, data=df, ax=ax, showmeans=True, meanprops={"marker":"o", "markerfacecolor":"red", "markeredgecolor":"black"}, showfliers=False)
ax.set_title(f'Box Plot of {feature} by rating')
ax.set_xlabel('rating')
ax.set_ylabel(feature)
plt.savefig("images/box_plot.svg")
Looking at the 5 box plot comparisons, the box plot of `100g_USD` against `rating` appears to show an exponential relationship, while the other 4 comparisons appear to be linear. However, those 4 box plots follow an S-shaped pattern, which suggests the `body`, `flavor`, `acid`, and `aftertaste` distributions are close to uniform.
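As a quick check of that uniformity impression, the normalized value counts of the four sub-scores can be printed directly; a minimal sketch using the cleaned `df` from above:

```python
# Share of observations at each score for the four sub-score features;
# a uniform distribution would put roughly equal mass on every value.
for column in ['acid', 'body', 'flavor', 'aftertaste']:
    shares = df[column].value_counts(normalize=True).sort_index().round(3)
    print(column, shares.to_dict())
```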
corr_matrix = df[ordinal_features].corr()
corr_matrix.style.background_gradient(cmap='coolwarm')
| | acid | body | flavor | aftertaste | 100g_USD | rating |
|---|---|---|---|---|---|---|
acid | 1.000000 | 0.303699 | 0.373901 | 0.450708 | 0.178089 | 0.758571 |
body | 0.303699 | 1.000000 | 0.304462 | 0.312396 | 0.174092 | 0.651938 |
flavor | 0.373901 | 0.304462 | 1.000000 | 0.392868 | 0.234329 | 0.669822 |
aftertaste | 0.450708 | 0.312396 | 0.392868 | 1.000000 | 0.250361 | 0.722725 |
100g_USD | 0.178089 | 0.174092 | 0.234329 | 0.250361 | 1.000000 | 0.280223 |
rating | 0.758571 | 0.651938 | 0.669822 | 0.722725 | 0.280223 | 1.000000 |
Most and Least Correlated Features to Rating: The `acid` feature has the highest correlation with `rating`, with a coefficient of 0.76, while `100g_USD` has the lowest at 0.28.

High Correlations Among Other Features: `aftertaste` and `acid` have a relatively high correlation coefficient of 0.45.
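Given the correlations among the predictors themselves, a multicollinearity check before fitting the regression may be worthwhile. A minimal sketch using statsmodels' variance inflation factor (the variable name `X_vif` is mine):

```python
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF for each numeric predictor; values well below 5 suggest the
# correlations above are not a serious multicollinearity problem.
X_vif = sm.add_constant(df[['acid', 'body', 'flavor', 'aftertaste', '100g_USD']].astype(float))
for i, col in enumerate(X_vif.columns[1:], start=1):
    print(f"{col:12s} VIF: {variance_inflation_factor(X_vif.values, i):.2f}")
```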
For the categorical features, I will start by printing the number of observations per category.
categorical_features = ['roast', 'loc_country']
for feature in categorical_features:
values = df[feature].value_counts()
print(f"Number of {feature} Cats:", len(values))
print(values, '\n')
Number of roast Cats: 5
Medium_Light    1304
Light            280
Medium           175
Medium_Dark       16
Dark               4
Name: roast, dtype: int64

Number of loc_country Cats: 17
United States              1210
Taiwan                      402
Hawai'i                      82
Guatemala                    27
Canada                       21
Hong Kong                    12
Japan                        11
Peru                          2
China                         2
England                       2
Mexico                        2
Uganda                        1
Australia                     1
New Taiwan                    1
Kenya                         1
Honduras                      1
United States And Floyd       1
Name: loc_country, dtype: int64
Looking at the value counts, the Medium_Light category in `roast` and the United States category in `loc_country` both have more observations than all other categories in their respective features combined. Next, I will visualize the categorical features using box plot comparisons, where the distribution of `rating` is plotted against each category of a feature.
plt.figure(figsize=(8, 6))
sns.boxplot(x='roast', y='rating', data=df, showmeans=True, meanprops={"marker":"o", "markerfacecolor":"red", "markeredgecolor":"black"}, showfliers=False)
plt.title('Box Plot of Ratings by Roast Category')
plt.xlabel('Roast Category')
plt.ylabel('Rating')
plt.savefig("images/box_plot_roast.svg")
Looking at the `rating` and `roast` box plot comparison, there appears to be a relationship where the mean `rating` decreases as the level of `roast` increases.
plt.figure(figsize=(18, 6))
sns.boxplot(x='loc_country', y='rating', data=df, showmeans=True, meanprops={"marker":"o", "markerfacecolor":"red", "markeredgecolor":"black"}, showfliers=False)
plt.title('Box Plot of Ratings by Country Category')
plt.xlabel('Country')
plt.ylabel('Rating')
plt.savefig("images/box_plot_loc.svg")
Looking at the `rating` and `loc_country` box plot comparison, there does not appear to be significant variance in the mean `rating` across countries.
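To back up this impression with numbers, the mean rating per country can be computed directly; a small sketch (the variable name `country_stats` is mine):

```python
# Mean rating and observation count per country, sorted by mean;
# a numeric companion to the box plots above.
country_stats = df.groupby('loc_country', observed=True)['rating'].agg(['mean', 'count'])
print(country_stats.sort_values('mean', ascending=False).round(2))
```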
Looking at the plots of the distributions for the numerical features, they do not appear to be normally distributed. I will use the Shapiro-Wilk test for normality and the Kolmogorov-Smirnov test for uniformity.
print("Test for Normality:")
for column in ordinal_features:
stat, p = stats.shapiro(df[column])
print(f" {column:12s} stat: {stat:.3f} p-value: {p:.3e}")
print("\n Test for Uniformity:")
for column in ordinal_features:
stat, p = stats.kstest(df[column], 'uniform')
print(f" {column:12s} stat: {stat:.3f} p-value: {p:.3e}")
Test for Normality:
 acid         stat: 0.699 p-value: 0.000e+00
 body         stat: 0.632 p-value: 0.000e+00
 flavor       stat: 0.427 p-value: 0.000e+00
 aftertaste   stat: 0.649 p-value: 0.000e+00
 100g_USD     stat: 0.485 p-value: 0.000e+00
 rating       stat: 0.920 p-value: 1.091e-29

 Test for Uniformity:
 acid         stat: 1.000 p-value: 0.000e+00
 body         stat: 1.000 p-value: 0.000e+00
 flavor       stat: 1.000 p-value: 0.000e+00
 aftertaste   stat: 1.000 p-value: 0.000e+00
 100g_USD     stat: 0.999 p-value: 0.000e+00
 rating       stat: 1.000 p-value: 0.000e+00
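Note that `kstest(x, 'uniform')` compares against a uniform distribution on [0, 1], which is why the uniformity statistics above are all close to 1. To test each feature against a uniform distribution over its own observed range instead, the location and scale can be passed explicitly; a minimal sketch:

```python
# KS test against a uniform distribution spanning each feature's own range.
for column in ordinal_features:
    lo = df[column].min()
    span = df[column].max() - lo
    stat, p = stats.kstest(df[column], 'uniform', args=(lo, span))
    print(f" {column:12s} stat: {stat:.3f} p-value: {p:.3e}")
```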
Looking at the results of the Shapiro-Wilk test, none of the features appear to come from a normal distribution, although `rating` comes closest with the highest test statistic. Since `100g_USD` appears to be exponentially distributed, I will apply a Box-Cox transformation to bring it closer to normality and reduce the influence of outlier values. This will create a better fit for the models in Sprint 5.
df['100g_USD'], fitted_lambda = stats.boxcox(df['100g_USD'])
print("Lambda for 100g_USD:", round(fitted_lambda, 3))
stat, p = stats.shapiro(df['100g_USD'])
print(f"stat: {stat:.3f} p-value: {p:.3e}")
df['100g_USD'].plot(kind='hist', bins=15, alpha=0.7, color='blue')
Lambda for 100g_USD: -0.308
stat: 0.891 p-value: 9.273e-34
This feature is now more normally distributed.
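As an optional visual check of the transformed feature, a Q-Q plot against the normal distribution can be drawn with scipy; a minimal sketch:

```python
# Q-Q plot of the Box-Cox transformed 100g_USD against a normal distribution;
# points close to the reference line indicate approximate normality.
fig, ax = plt.subplots(figsize=(6, 4))
stats.probplot(df['100g_USD'], dist='norm', plot=ax)
ax.set_title('Q-Q plot of transformed 100g_USD')
plt.show()
```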
Here, I fit linear models using a variety of feature sets to find the model with the highest F-statistic, R-squared, and log-likelihood values. First, I will fit a linear regression where `rating` is the dependent variable and all of the numerical features are the independent variables; the fit is done using the least squares method. This linear equation is written as follows:

$ \text{rating} = \beta_0 + \beta_1 \text{acid} + \beta_2 \text{body} + \beta_3 \text{flavor} + \beta_4 \text{aftertaste} + \beta_5 \text{100g\_USD} + \epsilon $
X = df[['acid', 'body', 'flavor', 'aftertaste', '100g_USD']]
X = sm.add_constant(X)
y = df['rating']
model = sm.OLS(y, X).fit()
model.summary()
Dep. Variable: | rating | R-squared: | 0.954 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.953 |
Method: | Least Squares | F-statistic: | 7277. |
Date: | Sat, 04 May 2024 | Prob (F-statistic): | 0.00 |
Time: | 14:57:53 | Log-Likelihood: | -606.38 |
No. Observations: | 1779 | AIC: | 1225. |
Df Residuals: | 1773 | BIC: | 1258. |
Df Model: | 5 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
const | 52.8182 | 0.243 | 217.101 | 0.000 | 52.341 | 53.295 |
acid | 1.1568 | 0.017 | 66.767 | 0.000 | 1.123 | 1.191 |
body | 1.0814 | 0.018 | 60.724 | 0.000 | 1.047 | 1.116 |
flavor | 1.3906 | 0.028 | 49.168 | 0.000 | 1.335 | 1.446 |
aftertaste | 1.0522 | 0.020 | 52.384 | 0.000 | 1.013 | 1.092 |
100g_USD | 0.0452 | 0.027 | 1.667 | 0.096 | -0.008 | 0.098 |
Omnibus: | 316.516 | Durbin-Watson: | 1.855 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 599.230 |
Skew: | -1.078 | Prob(JB): | 7.57e-131 |
Kurtosis: | 4.854 | Cond. No. | 521. |
After fitting this model, the R-squared value is 95%, and the F-statistic is 7277. The p-value of 0.096 for the `100g_USD` coefficient indicates there is insufficient evidence that `100g_USD` has a meaningful effect in the model. Considering this, I will update the model to exclude the `100g_USD` feature.
X = df[['acid', 'body', 'flavor', 'aftertaste']]
X = sm.add_constant(X)
y = df['rating']
model = sm.OLS(y, X).fit()
model.summary()
Dep. Variable: | rating | R-squared: | 0.953 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.953 |
Method: | Least Squares | F-statistic: | 9086. |
Date: | Sat, 04 May 2024 | Prob (F-statistic): | 0.00 |
Time: | 14:57:53 | Log-Likelihood: | -607.78 |
No. Observations: | 1779 | AIC: | 1226. |
Df Residuals: | 1774 | BIC: | 1253. |
Df Model: | 4 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
const | 52.7393 | 0.239 | 220.886 | 0.000 | 52.271 | 53.208 |
acid | 1.1597 | 0.017 | 67.233 | 0.000 | 1.126 | 1.194 |
body | 1.0847 | 0.018 | 61.238 | 0.000 | 1.050 | 1.119 |
flavor | 1.3963 | 0.028 | 49.709 | 0.000 | 1.341 | 1.451 |
aftertaste | 1.0572 | 0.020 | 53.220 | 0.000 | 1.018 | 1.096 |
Omnibus: | 319.244 | Durbin-Watson: | 1.859 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 608.524 |
Skew: | -1.083 | Prob(JB): | 7.25e-133 |
Kurtosis: | 4.876 | Cond. No. | 509. |
After fitting the updated model, the R-squared value remains at 95%, and the F-statistic increases to 9086. This means the simpler model explains the `rating` feature essentially as well with one fewer predictor. Next, I will create a design matrix from the `roast` feature and fit a model that can be used for ANOVA.
design = pd.get_dummies(df[['roast', 'rating']], columns=['roast'])
design = sm.add_constant(design)
design.head(10)
| | const | rating | roast_Light | roast_Medium_Light | roast_Medium | roast_Medium_Dark | roast_Dark |
|---|---|---|---|---|---|---|---|
0 | 1.0 | 94 | 0 | 1 | 0 | 0 | 0 |
1 | 1.0 | 93 | 0 | 1 | 0 | 0 | 0 |
2 | 1.0 | 92 | 0 | 1 | 0 | 0 | 0 |
3 | 1.0 | 92 | 0 | 1 | 0 | 0 | 0 |
4 | 1.0 | 92 | 0 | 1 | 0 | 0 | 0 |
5 | 1.0 | 92 | 1 | 0 | 0 | 0 | 0 |
6 | 1.0 | 91 | 0 | 1 | 0 | 0 | 0 |
7 | 1.0 | 91 | 0 | 1 | 0 | 0 | 0 |
8 | 1.0 | 95 | 1 | 0 | 0 | 0 | 0 |
9 | 1.0 | 94 | 1 | 0 | 0 | 0 | 0 |
model = ols('rating ~ roast_Light + roast_Medium_Light + roast_Medium', data=design).fit()
model.summary()
Dep. Variable: | rating | R-squared: | 0.128 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.126 |
Method: | Least Squares | F-statistic: | 86.78 |
Date: | Sat, 04 May 2024 | Prob (F-statistic): | 2.13e-52 |
Time: | 14:57:53 | Log-Likelihood: | -3214.5 |
No. Observations: | 1779 | AIC: | 6437. |
Df Residuals: | 1775 | BIC: | 6459. |
Df Model: | 3 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
Intercept | 89.4500 | 0.330 | 271.085 | 0.000 | 88.803 | 90.097 |
roast_Light | 4.0857 | 0.342 | 11.962 | 0.000 | 3.416 | 4.756 |
roast_Medium_Light | 3.7663 | 0.332 | 11.327 | 0.000 | 3.114 | 4.418 |
roast_Medium | 2.5043 | 0.348 | 7.190 | 0.000 | 1.821 | 3.187 |
Omnibus: | 111.169 | Durbin-Watson: | 1.464 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 211.523 |
Skew: | -0.439 | Prob(JB): | 1.17e-46 |
Kurtosis: | 4.443 | Cond. No. | 24.2 |
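Since this design matrix was built with ANOVA in mind, the corresponding one-way ANOVA table can also be produced directly from a formula; a minimal sketch (the name `anova_model` is mine):

```python
# One-way ANOVA of rating across roast levels; C(roast) treats roast as a
# single categorical factor, analogous to the dummy-variable model above.
anova_model = ols('rating ~ C(roast)', data=df).fit()
print(sm.stats.anova_lm(anova_model, typ=2))
```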
Based on the OLS results above, all three included roast categories have a statistically significant positive effect on the rating relative to the omitted darker roasts, with Light having the highest coefficient and t-statistic. I'll now combine the second and third models in an attempt to increase the R-squared value.
X = pd.get_dummies(df[['acid', 'body', 'flavor', 'aftertaste', 'roast']], columns=['roast'], drop_first=False)
X.drop(['roast_Medium_Dark', 'roast_Dark'], axis=1, inplace=True)
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
model.summary()
Dep. Variable: | rating | R-squared: | 0.955 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.954 |
Method: | Least Squares | F-statistic: | 5323. |
Date: | Sat, 04 May 2024 | Prob (F-statistic): | 0.00 |
Time: | 14:57:54 | Log-Likelihood: | -585.20 |
No. Observations: | 1779 | AIC: | 1186. |
Df Residuals: | 1771 | BIC: | 1230. |
Df Model: | 7 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
const | 52.7934 | 0.242 | 218.353 | 0.000 | 52.319 | 53.268 |
acid | 1.1372 | 0.017 | 65.332 | 0.000 | 1.103 | 1.171 |
body | 1.0760 | 0.018 | 61.270 | 0.000 | 1.042 | 1.110 |
flavor | 1.3857 | 0.028 | 49.811 | 0.000 | 1.331 | 1.440 |
aftertaste | 1.0457 | 0.020 | 53.041 | 0.000 | 1.007 | 1.084 |
roast_Light | 0.4303 | 0.081 | 5.327 | 0.000 | 0.272 | 0.589 |
roast_Medium_Light | 0.4161 | 0.078 | 5.314 | 0.000 | 0.263 | 0.570 |
roast_Medium | 0.2918 | 0.081 | 3.621 | 0.000 | 0.134 | 0.450 |
Omnibus: | 322.890 | Durbin-Watson: | 1.858 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 631.879 |
Skew: | -1.081 | Prob(JB): | 6.16e-138 |
Kurtosis: | 4.961 | Cond. No. | 524. |
While this updated model has the same R-squared value of 95% but a lower F-statistic of 5323, the log-likelihoods of all four models can be compared:

- Model 1 (numerical features including 100g_USD): -606.38
- Model 2 (numerical features without 100g_USD): -607.78
- Model 3 (roast dummies only): -3214.5
- Model 4 (numerical features plus roast dummies): -585.20

This final model has the highest log-likelihood (and the lowest AIC), so it should be considered to have the best fit.
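This comparison can also be reproduced programmatically by refitting the four candidate feature sets side by side; a sketch using the cleaned `df` (the model labels are mine):

```python
# Refit the four candidate models and compare log-likelihood and AIC.
roast_dummies = pd.get_dummies(df['roast'], prefix='roast').astype(float)
roast_dummies = roast_dummies[['roast_Light', 'roast_Medium_Light', 'roast_Medium']]
numeric = ['acid', 'body', 'flavor', 'aftertaste']

candidates = {
    'numerical + 100g_USD': df[numeric + ['100g_USD']].astype(float),
    'numerical only':       df[numeric].astype(float),
    'roast dummies only':   roast_dummies,
    'numerical + roast':    pd.concat([df[numeric].astype(float), roast_dummies], axis=1),
}
for name, X_cand in candidates.items():
    result = sm.OLS(df['rating'], sm.add_constant(X_cand)).fit()
    print(f"{name:22s} log-likelihood: {result.llf:9.2f}  AIC: {result.aic:7.1f}")
```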
!jupyter nbconvert --to html final.ipynb --output modeling_coffee_reviews
[NbConvertApp] Converting notebook final.ipynb to html [NbConvertApp] Writing 993725 bytes to modeling_coffee_reviews.html