Lab 1: Statistics and Probability

In [ ]:

Copied!

import numpy as np
import pandas as pd

# Show floats as whole numbers with thousands separators (e.g. 1,433,929)
# wherever pandas renders them.
pd.set_option("display.float_format", "{:,.0f}".format)

In [ ]:

Copied!

# Load the income table. The file's first row is skipped and the two columns
# are named explicitly instead of taking names from the file.
income_csv_path = "/content/drive/MyDrive/Colab Notebooks/Statistics/input/income.csv"
income_columns = ['name', 'income']
df = pd.read_csv(income_csv_path, skiprows=[0], names=income_columns)
df

Out[ ]:

	name	income
0	Rob	5000
1	Rafiq	6000
2	Nina	4000
3	Sofia	7500
4	Mohan	8000
5	Tao	7000
6	Elon Musk	10000000

Describe the data

In [ ]:

Copied!

# Summary statistics for income: count, mean, std, min, quartiles and max.
df["income"].describe()

Out[ ]:

	income
count	7.000000e+00
mean	1.433929e+06
std	3.777283e+06
min	4.000000e+03
25%	5.500000e+03
50%	7.000000e+03
75%	7.750000e+03
max	1.000000e+07

dtype: float64

In [ ]:

Copied!

# The 0th quantile is the smallest income in the column.
df["income"].quantile(q=0)

Out[ ]:

np.float64(4000.0)

In [ ]:

Copied!

# First quartile. interpolation="higher" returns the larger of the two
# neighbouring observations rather than a value interpolated between them.
df["income"].quantile(q=0.25, interpolation="higher")

Out[ ]:

np.int64(6000)

In [ ]:

Copied!

# Second quartile (median), again picking the higher neighbouring observation
# instead of interpolating.
df["income"].quantile(q=0.5, interpolation="higher")

Out[ ]:

np.int64(7000)

In [ ]:

Copied!

# Third quartile, using the default interpolation.
df["income"].quantile(q=0.75)

Out[ ]:

np.float64(7750.0)

In [ ]:

Copied!

# The 100th percentile is the largest income in the column.
df["income"].quantile(q=1)

Out[ ]:

np.float64(10000000.0)

In [ ]:

Copied!

# Summary statistics for income once more (repeats the earlier describe() call).
df["income"].describe()

Out[ ]:

	income
count	7
mean	1,433,929
std	3,777,283
min	4,000
25%	5,500
50%	7,000
75%	7,750
max	10,000,000

dtype: float64

Calculating median

In [ ]:

Copied!

# Median income: the middle value of the sorted incomes.
df["income"].median()

Out[ ]:

7000.0

Calculating IQR (Interquartile Range) and QD (Quartile Deviation)

In [ ]:

Copied!

# Interquartile range: the distance between the third and first quartiles,
# i.e. the spread of the middle half of the incomes.
IQR = df.income.quantile(0.75) - df.income.quantile(0.25)

# Quartile deviation (semi-interquartile range) is half of the IQR.
QD = IQR / 2
print(f"Quartile Deviation: {QD}")

# Keep IQR as the cell's displayed value.
IQR

Out[ ]:

np.float64(2250.0)

Find and Remove Outliers Using the IQR Technique

In [ ]:

Copied!





# Flag incomes that fall outside the 1.5 * IQR fences around the quartiles,
# then show two alternative treatments: dropping those rows or capping them.
#
# NOTE: this cell rebinds `df`. Re-running it without reloading the data first
# works on the already-filtered frame, so the printed outlier table is empty.
Q1 = df.income.quantile(0.25)
Q3 = df.income.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outliers
outliers = df[(df.income < lower_bound) | (df.income > upper_bound)]

print(outliers)

# Option 2: Cap them (winsorization).
# Computed from the unfiltered data and kept under its own name, because the
# two options are alternatives: capping after removal would change nothing.
df_capped = df.assign(income=df.income.clip(lower=lower_bound, upper=upper_bound))

# Option 1: Remove outliers.
# astype() returns a new frame, so `df` is no longer a slice of the original
# data. income is cast to float64 because the bounds are floats.
df = df[(df.income >= lower_bound) & (df.income <= upper_bound)].astype({"income": "float64"})

df

Empty DataFrame
Columns: [name, income]
Index: []

Out[ ]:

	name	income
0	Rob	5,000
1	Rafiq	6,000
2	Nina	4,000
3	Sofia	7,500
4	Mohan	8,000
5	Tao	7,000

In [ ]:

Copied!

# Sample standard deviation of income (pandas' std() uses ddof=1 by default).
sd = df["income"].std()
print(f"Standard Deviation: {sd}")

Standard Deviation: 1541.103500742244

In [ ]:

Copied!

# Sample variance of income (pandas' var() uses ddof=1 by default).
variance = df["income"].var()
print(f"Variance: {variance}")

Variance: 2375000.0

In [ ]:

Copied!





# Compute the sample variance and standard deviation by hand, using the
# (n - 1) denominator.
mean_income = df["income"].mean()

# Sample variance: sum of squared deviations from the mean, divided by n - 1.
squared_deviations = (df["income"] - mean_income) ** 2
variance_formula = np.sum(squared_deviations) / (len(df["income"]) - 1)
print(f"Variance (using formula): {variance_formula}")

# The standard deviation is the square root of the variance.
sd_formula = np.sqrt(variance_formula)
print(f"Standard Deviation (using formula): {sd_formula}")

Variance (using formula): 2375000.0
Standard Deviation (using formula): 1541.103500742244