Lab 1: Statistics and Probability
In [ ]:
Copied!
import pandas as pd
import numpy as np
pd.options.display.float_format = '{:,.0f}'.format
import pandas as pd
import numpy as np
pd.options.display.float_format = '{:,.0f}'.format
In [ ]:
Copied!
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Statistics/input/income.csv", names=['name', 'income'], skiprows=[0])
df
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Statistics/input/income.csv", names=['name', 'income'], skiprows=[0])
df
Out[ ]:
| | name | income |
|---|---|---|
| 0 | Rob | 5000 |
| 1 | Rafiq | 6000 |
| 2 | Nina | 4000 |
| 3 | Sofia | 7500 |
| 4 | Mohan | 8000 |
| 5 | Tao | 7000 |
| 6 | Elon Musk | 10000000 |
Describe the data
In [ ]:
Copied!
df.income.describe()
df.income.describe()
Out[ ]:
| | income |
|---|---|
| count | 7.000000e+00 |
| mean | 1.433929e+06 |
| std | 3.777283e+06 |
| min | 4.000000e+03 |
| 25% | 5.500000e+03 |
| 50% | 7.000000e+03 |
| 75% | 7.750000e+03 |
| max | 1.000000e+07 |
In [ ]:
Copied!
df.income.quantile(0)
df.income.quantile(0)
Out[ ]:
np.float64(4000.0)
In [ ]:
Copied!
df.income.quantile(0.25,interpolation="higher")
df.income.quantile(0.25,interpolation="higher")
Out[ ]:
np.int64(6000)
In [ ]:
Copied!
df.income.quantile(0.5,interpolation="higher")
df.income.quantile(0.5,interpolation="higher")
Out[ ]:
np.int64(7000)
In [ ]:
Copied!
df.income.quantile(0.75)
df.income.quantile(0.75)
Out[ ]:
np.float64(7750.0)
In [ ]:
Copied!
df.income.quantile(1)
df.income.quantile(1)
Out[ ]:
np.float64(10000000.0)
In [ ]:
Copied!
df.income.describe()
df.income.describe()
Out[ ]:
| | income |
|---|---|
| count | 7 |
| mean | 1,433,929 |
| std | 3,777,283 |
| min | 4,000 |
| 25% | 5,500 |
| 50% | 7,000 |
| 75% | 7,750 |
| max | 10,000,000 |
Calculating median
In [ ]:
Copied!
df.income.median()
df.income.median()
Out[ ]:
7000.0
Calculating IQR and QD (quartile deviation, QD = IQR / 2)
In [ ]:
Copied!
# Interquartile range: the spread of the middle 50% of the incomes (Q3 - Q1).
IQR = df.income.quantile(0.75) - df.income.quantile(0.25)
# Quartile deviation (semi-interquartile range) is half of the IQR.
QD = IQR / 2
IQR
# Interquartile range: the spread of the middle 50% of the incomes (Q3 - Q1).
IQR = df.income.quantile(0.75) - df.income.quantile(0.25)
# Quartile deviation (semi-interquartile range) is half of the IQR.
QD = IQR / 2
IQR
Out[ ]:
np.float64(2250.0)
Find and Remove Outliers Using the IQR Technique
In [ ]:
Copied!
# Quartiles and the IQR fences: a value counts as an outlier when it lies
# more than 1.5 * IQR below Q1 or above Q3.
Q1 = df.income.quantile(0.25)
Q3 = df.income.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Identify outliers
# NOTE: the fences are computed from the current df, so re-running this cell
# after df has been filtered below works on the already-cleaned data.
outliers = df[(df.income < lower_bound) | (df.income > upper_bound)]
print(outliers)
# Option 1: Remove outliers
# .copy() yields an independent frame, so the column assignment below does not
# write to a slice of the original (avoids pandas' SettingWithCopyWarning).
df = df[(df.income >= lower_bound) & (df.income <= upper_bound)].copy()
# Option 2: Cap them (winsorization)
# Alternative to Option 1. Run after the removal above, no value lies outside
# the bounds any more, so every income value stays numerically unchanged.
df.income = np.where(df.income > upper_bound, upper_bound,
                     np.where(df.income < lower_bound, lower_bound, df.income))
df
# Quartiles and the IQR fences: a value counts as an outlier when it lies
# more than 1.5 * IQR below Q1 or above Q3.
Q1 = df.income.quantile(0.25)
Q3 = df.income.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Identify outliers
# NOTE: the fences are computed from the current df, so re-running this cell
# after df has been filtered below works on the already-cleaned data.
outliers = df[(df.income < lower_bound) | (df.income > upper_bound)]
print(outliers)
# Option 1: Remove outliers
# .copy() yields an independent frame, so the column assignment below does not
# write to a slice of the original (avoids pandas' SettingWithCopyWarning).
df = df[(df.income >= lower_bound) & (df.income <= upper_bound)].copy()
# Option 2: Cap them (winsorization)
# Alternative to Option 1. Run after the removal above, no value lies outside
# the bounds any more, so every income value stays numerically unchanged.
df.income = np.where(df.income > upper_bound, upper_bound,
                     np.where(df.income < lower_bound, lower_bound, df.income))
df
Empty DataFrame
Columns: [name, income]
Index: []
Out[ ]:
| | name | income |
|---|---|---|
| 0 | Rob | 5,000 |
| 1 | Rafiq | 6,000 |
| 2 | Nina | 4,000 |
| 3 | Sofia | 7,500 |
| 4 | Mohan | 8,000 |
| 5 | Tao | 7,000 |
In [ ]:
Copied!
sd = df.income.std()
print(f"Standard Deviation: {sd}")
sd = df.income.std()
print(f"Standard Deviation: {sd}")
Standard Deviation: 1541.103500742244
In [ ]:
Copied!
variance = df.income.var()
print(f"Variance: {variance}")
variance = df.income.var()
print(f"Variance: {variance}")
Variance: 2375000.0
In [ ]:
Copied!
# Calculate mean
mean_income = df.income.mean()
# Calculate variance using the formula
variance_formula = np.sum((df.income - mean_income)**2) / (len(df.income) - 1)
print(f"Variance (using formula): {variance_formula}")
# Calculate standard deviation using the formula
sd_formula = np.sqrt(variance_formula)
print(f"Standard Deviation (using formula): {sd_formula}")
# Calculate mean
mean_income = df.income.mean()
# Calculate variance using the formula
variance_formula = np.sum((df.income - mean_income)**2) / (len(df.income) - 1)
print(f"Variance (using formula): {variance_formula}")
# Calculate standard deviation using the formula
sd_formula = np.sqrt(variance_formula)
print(f"Standard Deviation (using formula): {sd_formula}")
Variance (using formula): 2375000.0
Standard Deviation (using formula): 1541.103500742244