Basically I’m trying to split the data into numerical and categorical variables and then look at them separately to identify outliers that are three standard deviations away from the mean of that variable.
So I have the code, but I'm not sure about it. Please give me code that meets these requirements.
Code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the training set; the path is resolved relative to the working directory.
df = pd.read_csv('training_data.csv')
# Split the columns by dtype with the public select_dtypes API rather than the
# private DataFrame._get_numeric_data(). Anything that is not a number dtype
# (strings, categories, booleans, datetimes) lands on the categorical side.
num_only = df.select_dtypes(include='number')
cat_only = df.select_dtypes(exclude='number')
# Module-level list; the value returned by detect_outlier is what gets printed.
outliers = []
def detect_outlier(num_only, threshold=3):
    """Return the values lying more than ``threshold`` standard deviations from the mean.

    Args:
        num_only: Either a pandas DataFrame of numeric columns or a
            one-dimensional sequence (list, NumPy array, Series) of numbers.
        threshold: Absolute z-score above which a value counts as an outlier.
            Defaults to 3.

    Returns:
        For a DataFrame, a dict mapping each column label to the list of
        outlier values found in that column (each column is scored against
        its own mean and standard deviation). For one-dimensional input, a
        plain list of outlier values. A fresh result is built on every call.
    """
    if isinstance(num_only, pd.DataFrame):
        # Iterating a DataFrame yields column labels, so score each column
        # explicitly instead of looping over the frame itself.
        return {
            column: _column_outliers(num_only[column], threshold)
            for column in num_only.columns
        }
    return _column_outliers(num_only, threshold)


def _column_outliers(values, threshold):
    """Return the outliers of one numeric column as a list of floats."""
    arr = np.asarray(values, dtype=float)
    # Missing values would turn the mean and std into NaN and hide everything.
    arr = arr[~np.isnan(arr)]
    if arr.size == 0:
        return []
    mean_1 = arr.mean()
    std_1 = arr.std()  # population standard deviation (ddof=0), NumPy's default
    if std_1 == 0:
        # A constant column has no spread, hence no outliers; skip the 0/0.
        return []
    zscores = np.abs((arr - mean_1) / std_1)
    return arr[zscores > threshold].tolist()
# Run the detector on the numeric-only frame and print whatever it returns.
outlier_datapoints = detect_outlier(num_only)
print(outlier_datapoints)
Code to identify outliers:
import numpy as np
import matplotlib.pyplot as plt
# Seed NumPy's global generator so the synthetic sample is reproducible.
# (A bare `seed` is not defined by the imports above.)
np.random.seed(1)
anomalies = []
# 50,000 draws from a standard normal, scaled by 20 and shifted by 20.
data = np.random.randn(50000) * 20 + 20
def find_anomalies(random_data, threshold=3):
    """Return the values outside ``mean ± threshold * std`` of ``random_data``.

    Args:
        random_data: One-dimensional sequence of numbers (list, NumPy array).
        threshold: Number of standard deviations that defines the cut-off.
            Defaults to 3.

    Returns:
        A new list holding every value strictly above the upper limit or
        strictly below the lower limit. Nothing is carried over between calls.
    """
    values = np.asarray(random_data, dtype=float)
    # Drop missing values so they cannot turn the limits into NaN.
    values = values[~np.isnan(values)]
    if values.size == 0:
        return []
    random_data_std = values.std()  # population standard deviation (ddof=0)
    random_data_mean = values.mean()
    anomaly_cut_off = random_data_std * threshold
    lower_limit = random_data_mean - anomaly_cut_off
    upper_limit = random_data_mean + anomaly_cut_off
    # Existing runtime output, retained as-is.
    print(lower_limit)
    # One vectorised mask replaces the element-by-element Python loop.
    outside = (values > upper_limit) | (values < lower_limit)
    return values[outside].tolist()
find_anomalies(data)
Get Answers For Free
Most questions are answered within 1 hour.