Lec40_外れ値

In [24]:

import numpy as np
from pandas import DataFrame

In [2]:

# Seed the global NumPy random generator so the sample below is reproducible.
np.random.seed(12345)
# 1000 rows x 4 columns of draws from the standard normal distribution.
dframe = DataFrame(np.random.standard_normal(size=(1000, 4)))

In [3]:

# Preview the first five rows of the DataFrame.
dframe.head()

Out[3]:

	0	1	2	3
0	-0.204708	0.478943	-0.519439	-0.555730
1	1.965781	1.393406	0.092908	0.281746
2	0.769023	1.246435	1.007189	-1.296221
3	0.274992	0.228913	1.352917	0.886429
4	-2.001637	-0.371843	1.669025	-0.438570

In [4]:

# Preview the last five rows of the DataFrame.
dframe.tail()

Out[4]:

	0	1	2	3
995	1.089085	0.251232	-1.451985	1.653126
996	-0.478509	-0.010663	-1.060881	-1.502870
997	-1.946267	1.013592	0.037333	0.133304
998	-1.293122	-0.322542	-0.782960	-0.303340
999	0.089987	0.292291	1.177706	0.882755

In [5]:

# Summary statistics (count, mean, std, min, quartiles, max) for each column.
dframe.describe()

Out[5]:

	0	1	2	3
count	1000.000000	1000.000000	1000.000000	1000.000000
mean	-0.067684	0.067924	0.025598	-0.002298
std	0.998035	0.992106	1.006835	0.996794
min	-3.428254	-3.548824	-3.184377	-3.745356
25%	-0.774890	-0.591841	-0.641675	-0.644144
50%	-0.116401	0.101143	0.002073	-0.013611
75%	0.616366	0.780282	0.680391	0.654328
max	3.366626	2.653656	3.260383	3.927528

In [6]:

# Select the first column (label 0) as a Series.
col = dframe[0]

In [7]:

# Preview the first five values of the selected column.
col.head()

Out[7]:

0   -0.204708
1    1.965781
2    0.769023
3    0.274992
4   -2.001637
Name: 0, dtype: float64

In [8]:

# Pick out the elements whose absolute value exceeds 3
# (both large positive and large negative values are selected).
col[col.abs() > 3]

Out[8]:

523   -3.428254
900    3.366626
Name: 0, dtype: float64

In [19]:

# Apply the same check to every column of the DataFrame: keep each row in
# which at least one value has an absolute value greater than 3.
# `axis` is passed by keyword because DataFrame.any() no longer accepts it
# positionally in pandas 2.x (`.any(1)` raises TypeError there).
dframe[(np.abs(dframe) > 3).any(axis=1)]

Out[19]:

	0	1	2	3
5	-0.539741	0.476985	3.248944	-1.021228
97	-0.774363	0.552936	0.106061	3.927528
102	-0.655054	-0.565230	3.176873	0.959533
305	-2.315555	0.457246	-0.025907	-3.399312
324	0.050188	1.951312	3.260383	0.963301
400	0.146326	0.508391	-0.196713	-3.745356
499	-0.293333	-0.242459	-3.056990	1.918403
523	-3.428254	-0.296336	-0.439938	-0.867165
586	0.275144	1.179227	-3.184377	1.369891
808	-0.362528	-3.548824	1.553205	-2.186301
900	3.366626	-2.372214	0.851010	1.332846

In [23]:

# Element-wise sign of every value (the output below shows -1 and 1);
# multiplying this by 3 gives the capped value with the original sign.
np.sign(dframe)

Out[23]:

	0	1	2	3
0	-1	1	-1	-1
1	1	1	1	1
2	1	1	1	-1
3	1	1	1	1
4	-1	-1	1	-1
5	-1	1	1	-1
6	-1	1	1	1
7	1	1	-1	-1
8	-1	-1	-1	1
9	-1	1	-1	1
10	-1	-1	-1	-1
11	-1	1	1	-1
12	1	1	1	1
13	-1	1	1	-1
14	-1	1	1	-1
15	1	-1	1	1
16	1	-1	-1	-1
17	-1	-1	-1	1
18	1	1	1	-1
19	-1	-1	-1	1
20	1	-1	1	1
21	1	1	-1	1
22	-1	1	1	1
23	-1	1	1	1
24	1	-1	1	1
25	-1	-1	-1	-1
26	-1	-1	1	1
27	-1	1	-1	1
28	1	-1	-1	-1
29	1	-1	1	-1
...	...	...	...	...
970	1	1	1	-1
971	-1	-1	-1	1
972	1	1	-1	-1
973	-1	1	1	-1
974	-1	-1	1	-1
975	-1	-1	1	-1
976	1	1	1	1
977	-1	1	1	1
978	1	-1	1	1
979	1	1	-1	1
980	-1	-1	-1	-1
981	-1	1	1	1
982	-1	1	1	1
983	-1	1	1	1
984	1	-1	1	-1
985	-1	1	1	1
986	-1	-1	1	-1
987	-1	-1	1	-1
988	-1	1	-1	-1
989	1	1	1	1
990	-1	-1	-1	1
991	1	-1	1	1
992	-1	1	-1	-1
993	1	1	-1	1
994	1	-1	1	1
995	1	1	-1	1
996	-1	-1	-1	-1
997	-1	1	1	1
998	-1	-1	-1	-1
999	1	1	1	1

1000 rows × 4 columns

In [20]:

# Cap every value so that its absolute value is at most 3, keeping its sign:
# entries with |x| > 3 are overwritten in place with sign(x) * 3.
outlier_mask = np.abs(dframe) > 3
dframe[outlier_mask] = np.sign(dframe) * 3

In [21]:

# Summary statistics again after capping; min and max now lie within [-3, 3].
dframe.describe()

Out[21]:

	0	1	2	3
count	1000.000000	1000.000000	1000.000000	1000.000000
mean	-0.067623	0.068473	0.025153	-0.002081
std	0.995485	0.990253	1.003977	0.989736
min	-3.000000	-3.000000	-3.000000	-3.000000
25%	-0.774890	-0.591841	-0.641675	-0.644144
50%	-0.116401	0.101143	0.002073	-0.013611
75%	0.616366	0.780282	0.680391	0.654328
max	3.000000	2.653656	3.000000	3.000000