import pandas as pd

# Load the Zomato restaurant survey into a DataFrame.
df = pd.read_csv('zomato.csv')
# Preview the first two records to inspect the available columns.
df.head(2)
Restaurant ID | Restaurant Name | Country Code | City | Address | Locality | Locality Verbose | Longitude | Latitude | Cuisines | ... | Currency | Has Table booking | Has Online delivery | Is delivering now | Switch to order menu | Price range | Aggregate rating | Rating color | Rating text | Votes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6317637 | Le Petit Souffle | 162 | Makati City | Third Floor, Century City Mall, Kalayaan Avenu... | Century City Mall, Poblacion, Makati City | Century City Mall, Poblacion, Makati City, Mak... | 121.027535 | 14.565443 | French, Japanese, Desserts | ... | Botswana Pula(P) | Yes | No | No | No | 3 | 4.8 | Dark Green | Excellent | 314 |
1 | 6304287 | Izakaya Kikufuji | 162 | Makati City | Little Tokyo, 2277 Chino Roces Avenue, Legaspi... | Little Tokyo, Legaspi Village, Makati City | Little Tokyo, Legaspi Village, Makati City, Ma... | 121.014101 | 14.553708 | Japanese | ... | Botswana Pula(P) | Yes | No | No | No | 3 | 4.5 | Dark Green | Excellent | 591 |
2 rows × 21 columns
## Checking if dataset contains any null
# List every column that contains at least one missing value.
columns_with_nan = df.columns[df.isna().any()].tolist()
print(columns_with_nan)
['Cuisines']
# Load the lookup table that maps 'Country Code' to a country name.
df1 = pd.read_excel('Country-Code.xlsx')
# Preview the mapping.
df1.head()
Country Code | Country | |
---|---|---|
0 | 1 | India |
1 | 14 | Australia |
2 | 30 | Brazil |
3 | 37 | Canada |
4 | 94 | Indonesia |
Let us merge both the datasets. This will help us to understand the dataset country wise.
# Left-join the country names onto the survey so each restaurant row
# carries a human-readable 'Country' column.
df2 = df.merge(df1, on='Country Code', how='left')
df2.head(2)
Restaurant ID | Restaurant Name | Country Code | City | Address | Locality | Locality Verbose | Longitude | Latitude | Cuisines | ... | Has Table booking | Has Online delivery | Is delivering now | Switch to order menu | Price range | Aggregate rating | Rating color | Rating text | Votes | Country | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6317637 | Le Petit Souffle | 162 | Makati City | Third Floor, Century City Mall, Kalayaan Avenu... | Century City Mall, Poblacion, Makati City | Century City Mall, Poblacion, Makati City, Mak... | 121.027535 | 14.565443 | French, Japanese, Desserts | ... | Yes | No | No | No | 3 | 4.8 | Dark Green | Excellent | 314 | Phillipines |
1 | 6304287 | Izakaya Kikufuji | 162 | Makati City | Little Tokyo, 2277 Chino Roces Avenue, Legaspi... | Little Tokyo, Legaspi Village, Makati City | Little Tokyo, Legaspi Village, Makati City, Ma... | 121.014101 | 14.553708 | Japanese | ... | Yes | No | No | No | 3 | 4.5 | Dark Green | Excellent | 591 | Phillipines |
2 rows × 22 columns
# Report every country covered by the survey, plus the total count.
# FIX: corrected the misspelled output strings ("counteris", "accross",
# "number to country") and computed the unique values once.
countries = df2['Country'].unique()
print('List of countries the survey is spread across - ')
for country in countries:
    print(country)
print()
print('Total number of countries', len(countries))
List of counteris the survey is spread accross - Phillipines Brazil United States Australia Canada Singapore UAE India Indonesia New Zealand United Kingdom Qatar South Africa Sri Lanka Turkey Total number to country 15
from plotly.offline import init_notebook_mode, plot, iplot

# Donut chart: share of surveyed restaurants per country.
country_counts = df2.Country.value_counts()
labels = list(country_counts.index)
values = list(country_counts.values)

# Single pie trace rendered as a donut (60% hole).
pie_trace = {
    "labels": labels,
    "values": values,
    "hoverinfo": 'label+percent',
    "domain": {"x": [0, .9]},
    "hole": 0.6,
    "type": "pie",
    "rotation": 120,
}
# Caption placed near the donut.
center_note = {
    "font": {"size": 20},
    "showarrow": True,
    "text": "Countries",
    "x": 0.2,
    "y": 0.9,
}
fig = {
    "data": [pie_trace],
    "layout": {
        "title": "Zomato's Presence around the World",
        "annotations": [center_note],
    },
}
iplot(fig)
# Cross-tabulate rating value with its color/text label to see how the
# three fields map onto one another.
# FIX: removed the duplicated `df3` display statement and used
# reset_index(name=...) instead of renaming column 0 afterwards.
df3 = (
    df2.groupby(['Aggregate rating', 'Rating color', 'Rating text'])
       .size()
       .reset_index(name='Rating Count')
)
df3
Aggregate rating | Rating color | Rating text | Rating Count | |
---|---|---|---|---|
0 | 0.0 | White | Not rated | 2148 |
1 | 1.8 | Red | Poor | 1 |
2 | 1.9 | Red | Poor | 2 |
3 | 2.0 | Red | Poor | 7 |
4 | 2.1 | Red | Poor | 15 |
5 | 2.2 | Red | Poor | 27 |
6 | 2.3 | Red | Poor | 47 |
7 | 2.4 | Red | Poor | 87 |
8 | 2.5 | Orange | Average | 110 |
9 | 2.6 | Orange | Average | 191 |
10 | 2.7 | Orange | Average | 250 |
11 | 2.8 | Orange | Average | 315 |
12 | 2.9 | Orange | Average | 381 |
13 | 3.0 | Orange | Average | 468 |
14 | 3.1 | Orange | Average | 519 |
15 | 3.2 | Orange | Average | 522 |
16 | 3.3 | Orange | Average | 483 |
17 | 3.4 | Orange | Average | 498 |
18 | 3.5 | Yellow | Good | 480 |
19 | 3.6 | Yellow | Good | 458 |
20 | 3.7 | Yellow | Good | 427 |
21 | 3.8 | Yellow | Good | 400 |
22 | 3.9 | Yellow | Good | 335 |
23 | 4.0 | Green | Very Good | 266 |
24 | 4.1 | Green | Very Good | 274 |
25 | 4.2 | Green | Very Good | 221 |
26 | 4.3 | Green | Very Good | 174 |
27 | 4.4 | Green | Very Good | 144 |
28 | 4.5 | Dark Green | Excellent | 95 |
29 | 4.6 | Dark Green | Excellent | 78 |
30 | 4.7 | Dark Green | Excellent | 42 |
31 | 4.8 | Dark Green | Excellent | 25 |
32 | 4.9 | Dark Green | Excellent | 61 |
The above information helps us to understand the relation between Aggregate rating, color and text. We conclude that the following colors are assigned to the ratings:
Let us try to understand the spread of rating across restaurants
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (9, 5)
matplotlib.rcParams['figure.facecolor'] = '#00000000'
plt.figure(figsize=(12,6))
# plt.xticks(rotation=75)
plt.title('Rating Color')
sns.barplot(x=df3['Rating color'], y=df3['Rating Count']);
Interesting — most restaurants seem to have no rating at all. Let us check if these unrated restaurants belong to some specific country.
# Count the unrated ('White' color band) restaurants per country.
unrated = df2[df2['Rating color'] == 'White']
No_rating = unrated.groupby('Country').size().reset_index(name='Rating Count')
No_rating
Country | Rating Count | |
---|---|---|
0 | Brazil | 5 |
1 | India | 2139 |
2 | United Kingdom | 1 |
3 | United States | 3 |
India seems to have the highest number of unrated restaurants. In India the culture of ordering food online is still gaining momentum, so most restaurants remain unrated on Zomato — people may prefer visiting the restaurant for a meal instead.
# One row per (Country, Currency) pair accepted on Zomato.
# FIX: the original built a 'count' column only to drop it again (with a
# redundant inplace=False); selecting the two key columns after the
# groupby keeps the result identical and the intent clear.
country_currency = (
    df2.groupby(['Country', 'Currency'])
       .size()
       .reset_index()[['Country', 'Currency']]
)
country_currency.sort_values('Currency').reset_index(drop=True)
Country | Currency | |
---|---|---|
0 | Phillipines | Botswana Pula(P) |
1 | Brazil | Brazilian Real(R$) |
2 | Australia | Dollar($) |
3 | Canada | Dollar($) |
4 | Singapore | Dollar($) |
5 | United States | Dollar($) |
6 | UAE | Emirati Diram(AED) |
7 | India | Indian Rupees(Rs.) |
8 | Indonesia | Indonesian Rupiah(IDR) |
9 | New Zealand | NewZealand($) |
10 | United Kingdom | Pounds(£) |
11 | Qatar | Qatari Rial(QR) |
12 | South Africa | Rand(R) |
13 | Sri Lanka | Sri Lankan Rupee(LKR) |
14 | Turkey | Turkish Lira(TL) |
The above table displays each country and the currency it accepts. Interestingly, four countries seem to accept payment in dollars.
# Share of restaurants that do / don't offer online delivery.
# FIX: derive the total from the data instead of the hard-coded row
# count 9551, so the chart stays correct if the dataset changes.
plt.figure(figsize=(12, 6))
plt.title('Online Delivery Distribution')
delivery_counts = df2['Has Online delivery'].value_counts()
plt.pie(delivery_counts / delivery_counts.sum() * 100,
        labels=delivery_counts.index,
        autopct='%1.2f%%',
        startangle=180);
Only 25% of restaurants accept online delivery. This figure may be biased, since most of the restaurants listed here are from India. A city-wise analysis would probably be more informative.
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go

# Donut chart: top 20 cities by number of listed restaurants.
# FIX: dropped the stray plt.figure(...) call — it opened an empty
# matplotlib figure (the '<Figure ... with 0 Axes>' artifact) that
# plotly never uses — and the dead commented-out import.
city_counts = df2.City.value_counts().head(20)
labels = list(city_counts.index)
values = list(city_counts.values)
fig = {
    "data": [
        {
            "labels": labels,
            "values": values,
            "hoverinfo": 'label+percent',
            "domain": {"x": [0, .9]},
            "hole": 0.5,
            "type": "pie",
            "rotation": 120,
        },
    ],
    "layout": {
        "title": "Zomato's Presence Citywise",
        "annotations": [
            {
                "font": {"size": 20},
                "showarrow": True,
                "text": "Cities",
                "x": 0.2,
                "y": 0.9,
            },
        ],
    },
}
iplot(fig);
<Figure size 864x432 with 0 Axes>
The data seems to be skewed towards New Delhi, Gurgaon and Noida. I see minimal data for other cities. Hence I would do my analysis predominantly on New Delhi.
We’ve already gained several insights about the restaurants present in the survey. Let’s ask some specific questions and try to answer them using data frame operations and visualizations.
# Narrow the analysis to New Delhi, the city dominating the dataset.
Delhi = df2.loc[df2.City == 'New Delhi']
# Ten localities with the most Zomato listings.
top_listings = Delhi.Locality.value_counts().head(10)

plt.figure(figsize=(12, 6))
sns.barplot(x=top_listings, y=top_listings.index)
plt.ylabel(None);
plt.xlabel('Number of Resturants')
plt.title('Resturants Listing on Zomato');
Connaught Place seems to have the highest number of restaurants registered with Zomato. Let us understand the cuisines the top-rated restaurants there have to offer.
# Cuisines offered by Connaught Place restaurants rated
# 'Excellent' or 'Very Good', ordered by how often each combination
# appears.
ConnaughtPlace = Delhi[(Delhi.Locality.isin(['Connaught Place']))
                       & (Delhi['Rating text'].isin(['Excellent', 'Very Good']))]
# FIX: take the cuisine strings straight off the value_counts index
# instead of reset_index() plus a manual append loop; the old code also
# relied on the reset column being named 'index', which pandas >= 2.0
# renames to the original column name.
cuisien = ConnaughtPlace.Cuisines.value_counts().index.tolist()
cuisien
['North Indian, Chinese, Italian, Continental', 'North Indian, Chinese, Italian, American, Middle Eastern', 'Biryani, North Indian, Hyderabadi', 'North Indian, European', 'Cafe', 'Ice Cream', 'Continental, Mediterranean, Italian, North Indian', 'North Indian, Afghani, Mughlai', 'Chinese', 'North Indian', 'Continental, North Indian, Italian, Asian', 'Asian, North Indian', 'North Indian, Continental', 'North Indian, Italian, Asian, American', 'Bakery, Desserts, Fast Food', 'North Indian, European, Asian, Mediterranean', 'Continental, Mexican, Burger, American, Pizza, Tex-Mex', 'Continental, Italian, Asian, Indian', 'North Indian, Chinese', 'Modern Indian', 'Continental, North Indian, Chinese, Mediterranean', 'Italian, Mexican, Continental, North Indian, Finger Food', 'South Indian', 'Biryani, Hyderabadi', 'Fast Food, American, Burger', 'North Indian, Chinese, Continental, Italian', 'Continental, American, Asian, North Indian', 'Japanese', 'Healthy Food, Continental, Italian', 'Bakery, Fast Food, Desserts', 'North Indian, Mediterranean, Asian, Fast Food']
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import pandas as pd

# Build one lowercase, space-separated string out of every cuisine entry
# for the word cloud.
# FIX: a single join replaces the quadratic `+=` string-concatenation
# loop; the trailing space matches the original output exactly.
comment_words = " ".join(
    token.lower() for val in cuisien for token in str(val).split()
) + " "

stopwords = set(STOPWORDS)
wordcloud = WordCloud(width=1500, height=1500,
                      background_color='white',
                      stopwords=stopwords,
                      min_font_size=10).generate(comment_words)

# Plot the WordCloud image.
plt.figure(figsize=(8, 8), facecolor='orange')
plt.title('Resturants cuisien - Top Resturants')
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
Top rated restaurants seems to be doing well in the following cuisine
# Online-delivery availability across the ten busiest localities.
top_locality = Delhi.Locality.value_counts().head(10)
sns.set_theme(style="darkgrid")
plt.figure(figsize=(12, 6))
busiest = Delhi[Delhi.Locality.isin(top_locality.index)]
ax = sns.countplot(y="Locality", hue="Has Online delivery", data=busiest)
plt.title('Resturants Online Delivery');
# Does a pricier meal buy a better rating?
plt.figure(figsize=(12, 6))
sns.scatterplot(data=Delhi, x="Average Cost for two",
                y="Aggregate rating", hue='Price range')
plt.xlabel("Average Cost for two")
plt.ylabel("Aggregate rating")
plt.title('Rating vs Cost of Two');
I observe there is no linear relation between price and rating. For instance, Restaurants with good rating (like 4–5) have restaurants with all the price range and spread across the entire X axis
# How Delhi restaurants split across the rating labels.
Delhi.loc[:, 'Rating text'].value_counts()
Average 2495 Not rated 1425 Good 1128 Very Good 300 Poor 97 Excellent 28 Name: Rating text, dtype: int64
import plotly.express as px

# Map the 'Excellent'-rated Delhi restaurants on OpenStreetMap tiles.
Highly_rated = Delhi[Delhi['Rating text'].isin(['Excellent'])]
fig = px.scatter_mapbox(Highly_rated, lat="Latitude", lon="Longitude",
                        hover_name="City",
                        hover_data=["Aggregate rating", "Restaurant Name"],
                        color_discrete_sequence=["fuchsia"], zoom=10,
                        height=300)
fig.update_layout(mapbox_style="open-street-map")
# FIX: one consolidated layout call — the original issued three, with
# the later ones silently overriding autosize/height — and the title
# typos ('Highle', 'Resturants') are corrected.
fig.update_layout(
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    title='Highly rated Restaurants Location',
    hovermode='closest',
    showlegend=False,
    autosize=False,
    width=1000,
    height=500,
)
fig.show()
The aforementioned four cities represent nearly 65% of the total data available in the dataset. Apart from the highly rated local restaurants, it'd be interesting to know where the well-known eateries are commonplace. The verticals across which these can be located are -
# Well-known chains grouped by the vertical they compete in; used below
# to compare average ratings chain-by-chain.
types = {
    "Breakfast and Coffee": [
        "Cafe Coffee Day", "Starbucks", "Barista",
        "Costa Coffee", "Chaayos", "Dunkin' Donuts",
    ],
    "American": [
        "Domino's Pizza", "McDonald's", "Burger King",
        "Subway", "Dunkin' Donuts", "Pizza Hut",
    ],
    "Ice Creams and Shakes": [
        "Keventers", "Giani", "Giani's",
        "Starbucks", "Baskin Robbins", "Nirula's Ice Cream",
    ],
}
# Slice the Delhi listings down to the chains of each vertical.
def _chains(category):
    # All Delhi rows whose restaurant name belongs to the given vertical.
    return Delhi[Delhi['Restaurant Name'].isin(types[category])]

breakfast = _chains('Breakfast and Coffee')
american = _chains('American')
ice_cream = _chains('Ice Creams and Shakes')
# Mean aggregate rating per breakfast/coffee chain, best first.
# FIX: reset_index(drop=True) instead of keeping the old index as a
# column and dropping it afterwards with a second, non-inplace call.
breakfast = (
    breakfast[['Restaurant Name', 'Aggregate rating']]
    .groupby('Restaurant Name')
    .mean()
    .reset_index()
    .sort_values('Aggregate rating', ascending=False)
    .reset_index(drop=True)
)
breakfast
Restaurant Name | Aggregate rating | |
---|---|---|
0 | Chaayos | 3.812500 |
1 | Starbucks | 3.750000 |
2 | Costa Coffee | 3.450000 |
3 | Barista | 3.325000 |
4 | Dunkin' Donuts | 3.300000 |
5 | Cafe Coffee Day | 2.573684 |
import plotly.express as px

# Bar chart: average rating per breakfast/coffee chain.
# FIX: plot `breakfast` directly instead of rebinding the module-level
# name `df`, which clobbered the raw zomato dataset loaded earlier.
fig = px.bar(breakfast, y='Aggregate rating', x='Restaurant Name',
             text='Aggregate rating',
             title="Breakfast and Coffee locations")
fig.update_traces(texttemplate='%{text:.3s}', textposition='outside')
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
)
fig.show()
Chaayos outlets are doing better. We need more of those in Delhi. Café coffee day seems to be performing poorly in avg rating. They are required to improve their services.
# Mean aggregate rating per American fast-food chain, best first.
american = (
    american[['Restaurant Name', 'Aggregate rating']]
    .groupby('Restaurant Name')
    .mean()
    .reset_index()
    .sort_values('Aggregate rating', ascending=False)
)
american
Restaurant Name | Aggregate rating | |
---|---|---|
0 | Burger King | 3.477778 |
3 | McDonald's | 3.445455 |
2 | Dunkin' Donuts | 3.300000 |
4 | Pizza Hut | 3.158333 |
5 | Subway | 3.047368 |
1 | Domino's Pizza | 2.794545 |
import plotly.express as px

# Bar chart: average rating per American fast-food chain.
# FIX: plot `american` directly instead of rebinding the module-level
# name `df`, which clobbered the raw zomato dataset loaded earlier.
fig = px.bar(american, y='Aggregate rating', x='Restaurant Name',
             text='Aggregate rating', title="Fast Food Resturants")
fig.update_traces(texttemplate='%{text:.3s}', textposition='outside')
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
)
fig.show()
# Mean aggregate rating per ice-cream/shake chain, best first.
ice_cream = (
    ice_cream[['Restaurant Name', 'Aggregate rating']]
    .groupby('Restaurant Name')
    .mean()
    .reset_index()
    .sort_values('Aggregate rating', ascending=False)
)
ice_cream
Restaurant Name | Aggregate rating | |
---|---|---|
5 | Starbucks | 3.750000 |
2 | Giani's | 3.011765 |
3 | Keventers | 2.983333 |
0 | Baskin Robbins | 2.769231 |
1 | Giani | 2.675000 |
4 | Nirula's Ice Cream | 2.400000 |
import plotly.express as px

# Bar chart: average rating per ice-cream/shake chain.
# FIX: plot `ice_cream` directly instead of rebinding the module-level
# name `df`, which clobbered the raw zomato dataset loaded earlier.
fig = px.bar(ice_cream, y='Aggregate rating', x='Restaurant Name',
             text='Aggregate rating', title="Ice Cream Parlours")
fig.update_traces(texttemplate='%{text:.3s}', textposition='outside')
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
)
fig.show()
Foreign brands seem to be doing better than the local brands.
We’ve drawn many inferences from the survey. Here’s a summary of a few of them: