02 β Dataset Overview
InΒ [1]:
# Install pandas for data handling
%pip install pandas -q
# Install matplotlib for basic plotting
%pip install matplotlib -q
# Install seaborn for advanced visualizations
%pip install seaborn -q
Note: you may need to restart the kernel to use updated packages. Note: you may need to restart the kernel to use updated packages. Note: you may need to restart the kernel to use updated packages.
InΒ [2]:
# Import the necessary libraries
# For data manipulation and analysis
import pandas as pd
# For creating basic plots like line, bar, scatter, histogram
import matplotlib.pyplot as plt
# For advanced and statistical visualizations
import seaborn as sns
# To avoid warnings
import warnings
warnings.filterwarnings('ignore')
InΒ [3]:
# Import the dataset
df = pd.read_csv('/kaggle/input/health-insurance-lead-prediction-raw-data/Health Insurance Lead Prediction Raw Data.csv')
print("==================== IMPORT DATASET ====================")
print(df)
==================== IMPORT DATASET ====================
ID City_Code Region_Code Accomodation_Type Reco_Insurance_Type \
0 1 C3 3213 Rented Individual
1 2 C5 1117 Owned Joint
2 3 C5 3732 Owned Individual
3 4 C24 4378 Owned Joint
4 5 C8 2190 Rented Individual
... ... ... ... ... ...
50877 50878 C4 845 Rented Individual
50878 50879 C5 4188 Rented Individual
50879 50880 C1 442 Rented Individual
50880 50881 C1 4 Owned Joint
50881 50882 C3 3866 Rented Individual
Upper_Age Lower_Age Is_Spouse Health Indicator \
0 36 36 No X1
1 75 22 No X2
2 32 32 No NaN
3 52 48 No X1
4 44 44 No X2
... ... ... ... ...
50877 22 22 No X3
50878 27 27 No X3
50879 63 63 No X2
50880 71 49 No X2
50881 24 24 No X3
Holding_Policy_Duration Holding_Policy_Type Reco_Policy_Cat \
0 14+ 3.0 22
1 NaN NaN 22
2 1.0 1.0 19
3 14+ 3.0 19
4 3.0 1.0 16
... ... ... ...
50877 NaN NaN 18
50878 7.0 3.0 4
50879 14+ 1.0 12
50880 2.0 2.0 16
50881 2.0 3.0 18
Reco_Policy_Premium Response
0 11628.0 0
1 30510.0 0
2 7450.0 1
3 17780.0 0
4 10404.0 0
... ... ...
50877 7704.0 0
50878 5408.0 0
50879 11374.0 0
50880 28179.2 1
50881 11424.0 0
[50882 rows x 14 columns]
InΒ [4]:
# View the first 10 rows of dataset
print("==================== FIRST 10 ROWS ====================")
print(df.head(10))
==================== FIRST 10 ROWS ==================== ID City_Code Region_Code Accomodation_Type Reco_Insurance_Type Upper_Age \ 0 1 C3 3213 Rented Individual 36 1 2 C5 1117 Owned Joint 75 2 3 C5 3732 Owned Individual 32 3 4 C24 4378 Owned Joint 52 4 5 C8 2190 Rented Individual 44 5 6 C9 1785 Rented Individual 52 6 7 C3 679 Owned Individual 28 7 8 C1 3175 Owned Joint 75 8 9 C15 3497 Owned Joint 52 9 10 C1 530 Owned Joint 59 Lower_Age Is_Spouse Health Indicator Holding_Policy_Duration \ 0 36 No X1 14+ 1 22 No X2 NaN 2 32 No NaN 1.0 3 48 No X1 14+ 4 44 No X2 3.0 5 52 No X2 5.0 6 28 No NaN NaN 7 73 Yes X4 9.0 8 43 No X1 14.0 9 26 Yes NaN 7.0 Holding_Policy_Type Reco_Policy_Cat Reco_Policy_Premium Response 0 3.0 22 11628.0 0 1 NaN 22 30510.0 0 2 1.0 19 7450.0 1 3 3.0 19 17780.0 0 4 1.0 16 10404.0 0 5 1.0 22 15264.0 1 6 NaN 17 10640.0 0 7 4.0 17 29344.0 1 8 3.0 1 27283.2 0 9 4.0 18 21100.8 1
InΒ [5]:
# View last 10 rows of dataset
print("==================== LAST 10 ROWS ====================")
print(df.tail(10))
==================== LAST 10 ROWS ====================
ID City_Code Region_Code Accomodation_Type Reco_Insurance_Type \
50872 50873 C3 567 Rented Individual
50873 50874 C12 5450 Owned Individual
50874 50875 C6 195 Rented Individual
50875 50876 C6 231 Rented Individual
50876 50877 C26 579 Owned Individual
50877 50878 C4 845 Rented Individual
50878 50879 C5 4188 Rented Individual
50879 50880 C1 442 Rented Individual
50880 50881 C1 4 Owned Joint
50881 50882 C3 3866 Rented Individual
Upper_Age Lower_Age Is_Spouse Health Indicator \
50872 61 61 No X2
50873 65 65 No X2
50874 26 26 No X1
50875 36 36 No X3
50876 37 37 No NaN
50877 22 22 No X3
50878 27 27 No X3
50879 63 63 No X2
50880 71 49 No X2
50881 24 24 No X3
Holding_Policy_Duration Holding_Policy_Type Reco_Policy_Cat \
50872 4.0 2.0 22
50873 6.0 3.0 18
50874 5.0 1.0 20
50875 2.0 1.0 20
50876 2.0 1.0 12
50877 NaN NaN 18
50878 7.0 3.0 4
50879 14+ 1.0 12
50880 2.0 2.0 16
50881 2.0 3.0 18
Reco_Policy_Premium Response
50872 20730.0 0
50873 21536.0 0
50874 7524.0 0
50875 13574.0 1
50876 13222.0 0
50877 7704.0 0
50878 5408.0 0
50879 11374.0 0
50880 28179.2 1
50881 11424.0 0
InΒ [6]:
# Check the columns in the dataset
print("==================== COLUMNS ====================")
for col in df.columns:
print(col)
==================== COLUMNS ==================== ID City_Code Region_Code Accomodation_Type Reco_Insurance_Type Upper_Age Lower_Age Is_Spouse Health Indicator Holding_Policy_Duration Holding_Policy_Type Reco_Policy_Cat Reco_Policy_Premium Response
InΒ [7]:
# Check the datatypes of columns
print("==================== DATATYPES ====================")
print(df.info())
==================== DATATYPES ==================== <class 'pandas.core.frame.DataFrame'> RangeIndex: 50882 entries, 0 to 50881 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 50882 non-null int64 1 City_Code 50882 non-null object 2 Region_Code 50882 non-null int64 3 Accomodation_Type 50882 non-null object 4 Reco_Insurance_Type 50882 non-null object 5 Upper_Age 50882 non-null int64 6 Lower_Age 50882 non-null int64 7 Is_Spouse 50882 non-null object 8 Health Indicator 39191 non-null object 9 Holding_Policy_Duration 30631 non-null object 10 Holding_Policy_Type 30631 non-null float64 11 Reco_Policy_Cat 50882 non-null int64 12 Reco_Policy_Premium 50882 non-null float64 13 Response 50882 non-null int64 dtypes: float64(2), int64(6), object(6) memory usage: 5.4+ MB None
InΒ [8]:
# Check for missing values
print("==================== MISSING VALUES ====================")
print(df.isnull().sum())
==================== MISSING VALUES ==================== ID 0 City_Code 0 Region_Code 0 Accomodation_Type 0 Reco_Insurance_Type 0 Upper_Age 0 Lower_Age 0 Is_Spouse 0 Health Indicator 11691 Holding_Policy_Duration 20251 Holding_Policy_Type 20251 Reco_Policy_Cat 0 Reco_Policy_Premium 0 Response 0 dtype: int64
InΒ [9]:
# Check the statistics of dataset
print("==================== STATISTICS ====================")
print(df.describe())
==================== STATISTICS ====================
ID Region_Code Upper_Age Lower_Age \
count 50882.000000 50882.000000 50882.000000 50882.000000
mean 25441.500000 1732.788707 44.856275 42.738866
std 14688.512535 1424.081652 17.310271 17.319375
min 1.000000 1.000000 18.000000 16.000000
25% 12721.250000 523.000000 28.000000 27.000000
50% 25441.500000 1391.000000 44.000000 40.000000
75% 38161.750000 2667.000000 59.000000 57.000000
max 50882.000000 6194.000000 75.000000 75.000000
Holding_Policy_Type Reco_Policy_Cat Reco_Policy_Premium Response
count 30631.000000 50882.000000 50882.000000 50882.000000
mean 2.439228 15.115188 14183.950069 0.239947
std 1.025923 6.340663 6590.074873 0.427055
min 1.000000 1.000000 2280.000000 0.000000
25% 1.000000 12.000000 9248.000000 0.000000
50% 3.000000 17.000000 13178.000000 0.000000
75% 3.000000 20.000000 18096.000000 0.000000
max 4.000000 22.000000 43350.400000 1.000000
InΒ [10]:
# Check the number of unique values in each columns
print("==================== UNIQUE VALUES PER COLUMN ====================")
print(df.nunique())
==================== UNIQUE VALUES PER COLUMN ==================== ID 50882 City_Code 36 Region_Code 5316 Accomodation_Type 2 Reco_Insurance_Type 2 Upper_Age 58 Lower_Age 60 Is_Spouse 2 Health Indicator 9 Holding_Policy_Duration 15 Holding_Policy_Type 4 Reco_Policy_Cat 22 Reco_Policy_Premium 6977 Response 2 dtype: int64
InΒ [11]:
# Find the percentage of missing values in column
print("==================== MISSING VALUES PERCENTAGE ====================")
print(df.isnull().sum()/ len(df) * 100)
==================== MISSING VALUES PERCENTAGE ==================== ID 0.000000 City_Code 0.000000 Region_Code 0.000000 Accomodation_Type 0.000000 Reco_Insurance_Type 0.000000 Upper_Age 0.000000 Lower_Age 0.000000 Is_Spouse 0.000000 Health Indicator 22.976691 Holding_Policy_Duration 39.799929 Holding_Policy_Type 39.799929 Reco_Policy_Cat 0.000000 Reco_Policy_Premium 0.000000 Response 0.000000 dtype: float64
03 β Data Cleaning
InΒ [12]:
# Delete duplicate values from dataset
print("==================== DROP DUPLICATES ====================")
df.drop_duplicates()
print(df)
# The dataset does not have any duplicates
==================== DROP DUPLICATES ====================
ID City_Code Region_Code Accomodation_Type Reco_Insurance_Type \
0 1 C3 3213 Rented Individual
1 2 C5 1117 Owned Joint
2 3 C5 3732 Owned Individual
3 4 C24 4378 Owned Joint
4 5 C8 2190 Rented Individual
... ... ... ... ... ...
50877 50878 C4 845 Rented Individual
50878 50879 C5 4188 Rented Individual
50879 50880 C1 442 Rented Individual
50880 50881 C1 4 Owned Joint
50881 50882 C3 3866 Rented Individual
Upper_Age Lower_Age Is_Spouse Health Indicator \
0 36 36 No X1
1 75 22 No X2
2 32 32 No NaN
3 52 48 No X1
4 44 44 No X2
... ... ... ... ...
50877 22 22 No X3
50878 27 27 No X3
50879 63 63 No X2
50880 71 49 No X2
50881 24 24 No X3
Holding_Policy_Duration Holding_Policy_Type Reco_Policy_Cat \
0 14+ 3.0 22
1 NaN NaN 22
2 1.0 1.0 19
3 14+ 3.0 19
4 3.0 1.0 16
... ... ... ...
50877 NaN NaN 18
50878 7.0 3.0 4
50879 14+ 1.0 12
50880 2.0 2.0 16
50881 2.0 3.0 18
Reco_Policy_Premium Response
0 11628.0 0
1 30510.0 0
2 7450.0 1
3 17780.0 0
4 10404.0 0
... ... ...
50877 7704.0 0
50878 5408.0 0
50879 11374.0 0
50880 28179.2 1
50881 11424.0 0
[50882 rows x 14 columns]
InΒ [13]:
# Drop any useless column
print("==================== DROP RESPONSE COLUMN ====================")
df = df.drop(columns = "Response")
print(df)
==================== DROP RESPONSE COLUMN ====================
ID City_Code Region_Code Accomodation_Type Reco_Insurance_Type \
0 1 C3 3213 Rented Individual
1 2 C5 1117 Owned Joint
2 3 C5 3732 Owned Individual
3 4 C24 4378 Owned Joint
4 5 C8 2190 Rented Individual
... ... ... ... ... ...
50877 50878 C4 845 Rented Individual
50878 50879 C5 4188 Rented Individual
50879 50880 C1 442 Rented Individual
50880 50881 C1 4 Owned Joint
50881 50882 C3 3866 Rented Individual
Upper_Age Lower_Age Is_Spouse Health Indicator \
0 36 36 No X1
1 75 22 No X2
2 32 32 No NaN
3 52 48 No X1
4 44 44 No X2
... ... ... ... ...
50877 22 22 No X3
50878 27 27 No X3
50879 63 63 No X2
50880 71 49 No X2
50881 24 24 No X3
Holding_Policy_Duration Holding_Policy_Type Reco_Policy_Cat \
0 14+ 3.0 22
1 NaN NaN 22
2 1.0 1.0 19
3 14+ 3.0 19
4 3.0 1.0 16
... ... ... ...
50877 NaN NaN 18
50878 7.0 3.0 4
50879 14+ 1.0 12
50880 2.0 2.0 16
50881 2.0 3.0 18
Reco_Policy_Premium
0 11628.0
1 30510.0
2 7450.0
3 17780.0
4 10404.0
... ...
50877 7704.0
50878 5408.0
50879 11374.0
50880 28179.2
50881 11424.0
[50882 rows x 13 columns]
InΒ [14]:
# Change column name of column Accomodation_Type
print("==================== RENAME ACCOMODATION COLUMN ====================")
df = df.rename(columns={"Accomodation_Type": "Acomodation"})
print(df)
==================== RENAME ACCOMODATION COLUMN ====================
ID City_Code Region_Code Acomodation Reco_Insurance_Type \
0 1 C3 3213 Rented Individual
1 2 C5 1117 Owned Joint
2 3 C5 3732 Owned Individual
3 4 C24 4378 Owned Joint
4 5 C8 2190 Rented Individual
... ... ... ... ... ...
50877 50878 C4 845 Rented Individual
50878 50879 C5 4188 Rented Individual
50879 50880 C1 442 Rented Individual
50880 50881 C1 4 Owned Joint
50881 50882 C3 3866 Rented Individual
Upper_Age Lower_Age Is_Spouse Health Indicator \
0 36 36 No X1
1 75 22 No X2
2 32 32 No NaN
3 52 48 No X1
4 44 44 No X2
... ... ... ... ...
50877 22 22 No X3
50878 27 27 No X3
50879 63 63 No X2
50880 71 49 No X2
50881 24 24 No X3
Holding_Policy_Duration Holding_Policy_Type Reco_Policy_Cat \
0 14+ 3.0 22
1 NaN NaN 22
2 1.0 1.0 19
3 14+ 3.0 19
4 3.0 1.0 16
... ... ... ...
50877 NaN NaN 18
50878 7.0 3.0 4
50879 14+ 1.0 12
50880 2.0 2.0 16
50881 2.0 3.0 18
Reco_Policy_Premium
0 11628.0
1 30510.0
2 7450.0
3 17780.0
4 10404.0
... ...
50877 7704.0
50878 5408.0
50879 11374.0
50880 28179.2
50881 11424.0
[50882 rows x 13 columns]
InΒ [15]:
# Find all the unique values in column Health Indicator
print("==================== HEALTH INDICATOR UNIQUE VALUES ====================")
print(df['Health Indicator'].unique())
==================== HEALTH INDICATOR UNIQUE VALUES ==================== ['X1' 'X2' nan 'X4' 'X3' 'X6' 'X5' 'X8' 'X7' 'X9']
InΒ [16]:
# Find the value counts of each value in column Health Indicator
print("==================== HEALTH INDICATOR VALUE COUNTS ====================")
print(df['Health Indicator'].value_counts())
==================== HEALTH INDICATOR VALUE COUNTS ==================== Health Indicator X1 13010 X2 10332 X3 6762 X4 5743 X5 1727 X6 1280 X7 196 X8 78 X9 63 Name: count, dtype: int64
InΒ [17]:
# Fill the empty values of column Health Indicator with mode
print("==================== HEALTH INDICATOR AFTER FILLNA ====================")
df['Health Indicator'] = df['Health Indicator'].fillna(df['Health Indicator'].mode()[0])
print(df['Health Indicator'])
==================== HEALTH INDICATOR AFTER FILLNA ====================
0 X1
1 X2
2 X1
3 X1
4 X2
..
50877 X3
50878 X3
50879 X2
50880 X2
50881 X3
Name: Health Indicator, Length: 50882, dtype: object
InΒ [18]:
# Find all the unique values in column Holding_Policy_Type
print("==================== HOLDING POLICY TYPE UNIQUE VALUES ====================")
print(df['Holding_Policy_Type'].unique())
==================== HOLDING POLICY TYPE UNIQUE VALUES ==================== [ 3. nan 1. 4. 2.]
InΒ [19]:
# Find the value counts of each value in column Holding_Policy_Type
print("==================== HOLDING POLICY TYPE VALUE COUNTS ====================")
print(df['Holding_Policy_Type'].value_counts())
==================== HOLDING POLICY TYPE VALUE COUNTS ==================== Holding_Policy_Type 3.0 13279 1.0 8173 2.0 5005 4.0 4174 Name: count, dtype: int64
InΒ [20]:
# Fill the empty values of column Holding_Policy_Type with mode
print("==================== HOLDING POLICY TYPE AFTER FILLNA ====================")
df['Holding_Policy_Type'] = df['Holding_Policy_Type'].fillna(df['Holding_Policy_Type'].mode()[0])
print(df['Holding_Policy_Duration'])
==================== HOLDING POLICY TYPE AFTER FILLNA ====================
0 14+
1 NaN
2 1.0
3 14+
4 3.0
...
50877 NaN
50878 7.0
50879 14+
50880 2.0
50881 2.0
Name: Holding_Policy_Duration, Length: 50882, dtype: object
InΒ [21]:
# Find the unique value in column Holding_Policy_Duration
print("==================== HOLDING POLICY DURATION UNIQUE VALUES ====================")
print(df['Holding_Policy_Duration'].unique())
==================== HOLDING POLICY DURATION UNIQUE VALUES ==================== ['14+' nan '1.0' '3.0' '5.0' '9.0' '14.0' '7.0' '2.0' '11.0' '10.0' '8.0' '6.0' '4.0' '13.0' '12.0']
InΒ [22]:
# Find the value counts of each value in column Holding_Policy_Duration
print("==================== HOLDING POLICY DURATION VALUE COUNTS ====================")
print(df['Holding_Policy_Duration'].value_counts())
==================== HOLDING POLICY DURATION VALUE COUNTS ==================== Holding_Policy_Duration 1.0 4499 14+ 4335 2.0 4260 3.0 3586 4.0 2771 5.0 2362 6.0 1894 7.0 1645 8.0 1316 9.0 1114 10.0 813 11.0 546 12.0 513 13.0 511 14.0 466 Name: count, dtype: int64
InΒ [23]:
# Replace the incorrect format with correct data
print("==================== HOLDING POLICY DURATION AFTER REPLACE ====================")
df['Holding_Policy_Duration'] = df['Holding_Policy_Duration'].replace('14+', 14.0)
print(df['Holding_Policy_Duration'])
==================== HOLDING POLICY DURATION AFTER REPLACE ====================
0 14.0
1 NaN
2 1.0
3 14.0
4 3.0
...
50877 NaN
50878 7.0
50879 14.0
50880 2.0
50881 2.0
Name: Holding_Policy_Duration, Length: 50882, dtype: object
InΒ [24]:
# Find the unique value in column Holding_Policy_Duration
print("==================== HOLDING POLICY DURATION AFTER REPLACE ====================")
print(df['Holding_Policy_Duration'].unique())
==================== HOLDING POLICY DURATION AFTER REPLACE ==================== [14.0 nan '1.0' '3.0' '5.0' '9.0' '14.0' '7.0' '2.0' '11.0' '10.0' '8.0' '6.0' '4.0' '13.0' '12.0']
InΒ [25]:
# Change the datatype of Holding_Policy_Duration from object to float
print("==================== HOLDING POLICY DURATION AFTER CHANGING DATATYPE ====================")
df['Holding_Policy_Duration'] = df['Holding_Policy_Duration'].astype('float32')
print(df['Holding_Policy_Duration'])
==================== HOLDING POLICY DURATION AFTER CHANGING DATATYPE ====================
0 14.0
1 NaN
2 1.0
3 14.0
4 3.0
...
50877 NaN
50878 7.0
50879 14.0
50880 2.0
50881 2.0
Name: Holding_Policy_Duration, Length: 50882, dtype: float32
InΒ [26]:
# Fill empty value of Holding_Policy_Duration with median
print("==================== HOLDING POLICY DURATION AFTER FILLNA ====================")
df['Holding_Policy_Duration'].fillna(df['Holding_Policy_Duration'].median(), inplace = True)
print(df['Holding_Policy_Duration'])
==================== HOLDING POLICY DURATION AFTER FILLNA ====================
0 14.0
1 5.0
2 1.0
3 14.0
4 3.0
...
50877 5.0
50878 7.0
50879 14.0
50880 2.0
50881 2.0
Name: Holding_Policy_Duration, Length: 50882, dtype: float32
04 β Exploratory Data Analysis
InΒ [27]:
# 1. Are there outliers in Reco_Policy_Premium, Holding_Policy_Duration, Upper_Age and Lower_Age
# Column that could have outliers
cols = ['Upper_Age', 'Lower_Age', 'Reco_Policy_Premium', 'Holding_Policy_Duration']
# Size of the plot
plt.figure(figsize = (15,5))
# Ploting four column in diagram
for i, col in enumerate(cols, 1):
plt.subplot(1,4,i)
sns.boxplot( y = df[col])
plt.title (f"Boxplot of {col}", fontsize= 12)
plt.show()
InΒ [28]:
# Calculate the first quartile (Q1) of Holding_Policy_Duration
q1 = df['Holding_Policy_Duration'].quantile(0.25)
# Calculate the third quartile (Q3) of Holding_Policy_Duration
q3 = df['Holding_Policy_Duration'].quantile(0.75)
# Calculate the Interquartile Range (IQR)
IQR = q3 - q1
# Define the upper limit and lower limit for outliers
lowerlimit = q1 - 1.5 * IQR
upperlimit = q3 - 1.5 * IQR
# Remove rows containing outliers in Holding_Policy_Duration
df = df[(df['Holding_Policy_Duration'] >= lowerlimit) & (df['Holding_Policy_Duration'] <= upperlimit)]
InΒ [29]:
# 2. What is the relationship of Upper_Age and Lower_Age
# Plot the diagram
sns.relplot (x = "Upper_Age", y = "Lower_Age", data = df)
plt.title("Range of upper age and lower age")
plt.xlabel("Upper Age")
plt.ylabel("Lower Age")
plt.show()
InΒ [30]:
# 3. What is the distribution of Health Indicator
# Plot the diagram
sns.displot(df['Health Indicator'])
plt.title("Distribution of Health Indicators")
plt.show()
InΒ [31]:
# 4. What is the density of Reco_Policy_Cat
# For plot diagram
sns.kdeplot(df['Reco_Policy_Cat'].dropna(), shade=True, color='green', label='Reco Policy Cat')
plt.figure(figsize=(10,6))
plt.show()
<Figure size 1000x600 with 0 Axes>
InΒ [32]:
# 5. Which city has the highest number of customers
# For countplot diagram
plt.figure(figsize=(12,6))
sns.countplot(x='City_Code', data=df,)
plt.title("Number of Customers by City Code", fontsize=14)
plt.show()
InΒ [33]:
# 6. How does accomodation type affect is spouse
# For countplot diagram
plt.figure(figsize=(8,5))
sns.countplot(x='Acomodation', hue='Is_Spouse', data=df, palette=['black','pink'])
plt.title("Accommodation type and Spouse Status")
plt.show()
InΒ [34]:
# 7. Which Region_Code has the highest average Reco_Policy_Premium
# Calculate the top 10 regions with highest average Reco_Policy_Premium
Highest_Premium = (df.groupby('Region_Code')['Reco_Policy_Premium'].mean().sort_values(ascending = False).head(10))
print(Highest_Premium)
# For bar plot diagram
plt.figure(figsize=(10,6))
Highest_Premium.plot(kind='bar', color='black')
plt.title("Top 10 Regions by Average Reco_Policy_Premium", fontsize=14)
plt.xlabel("Region Code")
plt.ylabel("Reco_Policy_Premium")
plt.show()
Region_Code 1341 39960.0 2930 37734.0 2458 36966.4 4048 36556.8 5844 35305.6 4678 34764.8 5082 34217.6 2922 34201.6 2728 34099.2 3234 34041.6 Name: Reco_Policy_Premium, dtype: float64
InΒ [35]:
# 8. Does accommodation type (Owned vs Rented) affect the holding policy type
# Group by Acomodation and calculate average Holding_Policy_Type
avg_policy_type = df.groupby('Acomodation')['Holding_Policy_Type'].mean()
print(avg_policy_type)
# For bar plot idagram
plt.figure(figsize=(6,4))
avg_policy_type.plot(kind='bar', color=['skyblue', 'pink'])
plt.title("Average Holding Policy Type by Accommodation")
plt.ylabel("Holding Policy Type")
plt.xlabel("Accommodation Type")
plt.show()
Acomodation Owned 2.440845 Rented 2.119544 Name: Holding_Policy_Type, dtype: float64
InΒ [36]:
# 9. What is the most common Reco_Insurance_Type
# Count the occurrences of each type of recommended insurance
common_reco_type = df['Reco_Insurance_Type'].value_counts()
print(common_reco_type)
# For plot diagram
plt.figure(figsize = (10,6))
plt.barh(common_reco_type.index, common_reco_type.values, color='purple', edgecolor = 'black')
plt.xlabel("Insurance type")
plt.ylabel("Numbers of cutomer")
plt.show()
Reco_Insurance_Type Individual 9863 Joint 2482 Name: count, dtype: int64
InΒ [37]:
# 10. How does the duration of holding a policy affect the recommended policy premium
# Calculate average Reco_Policy_Premium for each Holding_Policy_Duration
top = df.groupby('Holding_Policy_Duration')['Reco_Policy_Premium'].mean()
print(top)
# For line diagram
plt.figure(figsize = (7,5))
sns.lineplot(x=top.index, y=top.values, marker='o', color='red')
plt.title("Average Reco Policy Premium by Holding Policy Duration")
plt.ylabel("Reco_Policy_Premium")
plt.show()
Holding_Policy_Duration 1.0 13671.302556 2.0 13897.180563 3.0 14366.722476 Name: Reco_Policy_Premium, dtype: float64
05 β Key Insights
06 β Conclusion
