import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
print("👽 Hello.")

👽 Hello.


import os

# Locate trade.csv under the user's home directory.
# os.path.expanduser('~') is used instead of os.getenv('HOME'): getenv
# returns None when HOME is not set (e.g. on Windows), and None + str raised
# TypeError before the file was ever opened.
csv_file_path = os.path.join(
    os.path.expanduser('~'), 'aiffel', 'data_preprocess', 'data', 'trade.csv'
)
trade = pd.read_csv(csv_file_path)
trade.head()


print('전체 데이터 건수:', len(trade))

전체 데이터 건수: 199


# Show, for every column of `trade`, how many entries are missing.
print('컬럼별 결측치 개수')
trade.isnull().sum()

컬럼별 결측치 개수

기간        0
국가명       0
수출건수      3
수출금액      4
수입건수      3
수입금액      3
무역수지      4
기타사항    199
dtype: int64


# The '기타사항' column contains only missing values, so remove it.
trade = trade.drop('기타사항', axis=1)
trade.head()


trade.isnull()


trade.isnull().any(axis=1)

0      False
1      False
2      False
3      False
4      False
       ...  
194    False
195    False
196     True
197     True
198     True
Length: 199, dtype: bool


# 값이 True인 데이터만 추출
trade[trade.isnull().any(axis=1)]


# Drop, in place, the rows in which all five numeric columns are missing.
# how='all' keeps a row as long as at least one of the listed columns has a
# value, so partially filled rows survive this step.
trade.dropna(how='all', subset=['수출건수', '수출금액', '수입건수', '수입금액', '무역수지'], inplace=True)
print("👽 It's okay, no biggie.")

👽 It's okay, no biggie.


trade[trade.isnull().any(axis=1)]


# 데이터 라벨 출력
trade.loc[[188, 191, 194]]


# Fill the export amount ('수출금액') of row 191 with the mean of the export
# amounts in rows 188 and 194.
trade.loc[191, '수출금액'] = (trade.loc[188, '수출금액'] + trade.loc[194, '수출금액'] )/2
trade.loc[[191]]


# Recompute the trade balance ('무역수지') of row 191 as export amount minus
# import amount; this must run after the export amount has been filled in.
trade.loc[191, '무역수지'] = trade.loc[191, '수출금액'] - trade.loc[191, '수입금액'] 
trade.loc[[191]]


# 중복된 데이터 여부를 불리언 값으로 반환
trade.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
191    False
192    False
193    False
194    False
195    False
Length: 196, dtype: bool


trade[trade.duplicated()]


trade[(trade['기간']=='2020년 03월')&(trade['국가명']=='중국')]


trade.drop_duplicates(inplace=True)
print("👽 It's okay, no biggie.")

👽 It's okay, no biggie.


def outlier(df, col, z):
    """Return the index labels of the rows whose z-score in ``col`` exceeds ``z``.

    The z-score of a value is its absolute distance from the column mean
    (``np.mean``) divided by the column standard deviation (``np.std``).

    Args:
        df: DataFrame to inspect.
        col: Name of the numeric column to score.
        z: Threshold; rows with a z-score strictly greater than this are
            reported.

    Returns:
        The index of the rows of ``df`` classified as outliers.
    """
    values = df[col]
    center = np.mean(values)
    spread = np.std(values)
    z_scores = abs(values - center) / spread
    is_outlier = z_scores > z
    return df[is_outlier].index
print("👽 It's okay, no biggie.")

👽 It's okay, no biggie.


trade.loc[outlier(trade, '무역수지', 1.5)]


trade.loc[outlier(trade, '무역수지', 2)]


trade.loc[outlier(trade, '무역수지', 3)]


# 이상치를 제외한 데이터만 추출
def not_outlier(df, col, z):
    """Return the index labels of the rows whose z-score in ``col`` is at most ``z``.

    A value is kept when its absolute distance from the column mean
    (``np.mean``) is no larger than ``z`` times the column standard
    deviation (``np.std``).

    Args:
        df: DataFrame to inspect.
        col: Name of the numeric column to score.
        z: Threshold; rows with a z-score less than or equal to this are kept.

    Returns:
        The index of the rows of ``df`` that are not outliers. Rows whose
        value in ``col`` is missing are not included.
    """
    values = df[col]
    deviation = abs(values - np.mean(values))
    # Compare against z * std instead of dividing by std. For a column with
    # zero variance the division produced 0/0 = NaN, every comparison was
    # False, and all rows were discarded as if they were outliers.
    within_threshold = deviation <= z * np.std(values)
    return df[within_threshold].index
print("👽 It's okay, no biggie.")

👽 It's okay, no biggie.


trade.loc[not_outlier(trade, '무역수지', 1.5)]


np.random.seed(2020)
data = np.random.randn(100)  # sample 100 numbers from a distribution with mean 0 and standard deviation 1
data = np.concatenate((data, np.array([8, 10, -3, -5])))      # append [8, 10, -3, -5] to the end of the data
data

array([-1.76884571,  0.07555227, -1.1306297 , -0.65143017, -0.89311563,
       -1.27410098, -0.06115443,  0.06451384,  0.41011295, -0.57288249,
       -0.80133362,  1.31203519,  1.27469887, -1.2143576 ,  0.31371941,
       -1.44482142, -0.3689613 , -0.76922658,  0.3926161 ,  0.05729383,
        2.08997884,  0.04197131, -0.04834072, -0.51315392, -0.08458928,
       -1.21545008, -1.41293073, -1.48691055,  0.38222486,  0.937673  ,
        1.77267804,  0.87882801,  0.33171912, -0.30603567,  1.24026615,
       -0.21562684,  0.15592948,  0.09805553,  0.83209585,  2.04520542,
       -0.31681392, -1.31283291, -1.75445746,  0.10209408, -1.36150208,
        0.48178488, -0.20832874, -0.09186351,  0.70268816,  0.10365506,
        0.62123638,  0.95411497,  2.03781352, -0.48445122,  0.2071549 ,
        1.64424216, -0.4882074 , -0.01782826,  0.46891556,  0.27987266,
       -0.64664972, -0.54406002, -0.16008985,  0.03781172,  1.03383296,
       -1.23096117, -1.24673665,  0.29572055,  2.1409624 , -0.92020227,
       -0.06000238,  0.27978391, -1.53126966, -0.30293101, -0.14601413,
        0.27746159, -0.13952066,  0.69515966, -0.11338746, -1.233267  ,
       -0.79614131, -0.46739138,  0.65890607, -0.41063115,  0.17344356,
        0.28946174,  1.03451736,  1.22661712,  1.71998252,  0.40806834,
        0.32256894,  1.04722748, -1.8196003 , -0.42582157,  0.12454883,
        2.31256634, -0.96557586, -0.34627486,  0.96668378, -0.92550192,
        8.        , 10.        , -3.        , -5.        ])


# 추가한 데이터 값이 벗어나있음
fig, ax = plt.subplots()
ax.boxplot(data)
plt.show()


Q3, Q1 = np.percentile(data, [75 ,25])
IQR = Q3 - Q1
IQR

1.1644925829790964


data[(Q1-1.5*IQR > data)|(Q3+1.5*IQR < data)]

array([ 2.31256634,  8.        , 10.        , -3.        , -5.        ])


# 무역수지를 사분위함수로 이상치 검색
# 데이터의 percentile이 아닌 quantile로 범위를 설정하였다.
def outlier2(df, col):
    """Return the rows of ``df`` whose ``col`` value lies outside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 0.25 and 0.75 quantiles of the column and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame to inspect.
        col: Name of the numeric column to check.

    Returns:
        A DataFrame holding the rows strictly below the lower fence or
        strictly above the upper fence.
    """
    values = df[col]
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    spread = upper_quartile - lower_quartile
    lower_fence = lower_quartile - 1.5 * spread
    upper_fence = upper_quartile + 1.5 * spread
    outside = (values < lower_fence) | (values > upper_fence)
    return df[outside]

outlier2(trade, '무역수지')


# 정규분포를 따라 랜덤하게 데이터 x를 생성합니다. 
np.random.seed(2020)
x = pd.DataFrame({'A': np.random.randn(100)*4+4,
                 'B': np.random.randn(100)-1})
x


# 데이터 x를 Standardization 기법으로 정규화합니다. 
x_standardization = (x - x.mean())/x.std()
x_standardization


# 데이터 x를 min-max scaling 기법으로 정규화합니다. 
x_min_max = (x-x.min())/(x.max()-x.min())
x_min_max


# 각 컬럼의 평균은 0으로, 분산은 1로 데이터를 바꿔줍니다.
fig, axs = plt.subplots(1,2, figsize=(12, 4),
                        gridspec_kw={'width_ratios': [2, 1]})

axs[0].scatter(x['A'], x['B'])
axs[0].set_xlim(-5, 15)
axs[0].set_ylim(-5, 5)
axs[0].axvline(c='grey', lw=1)
axs[0].axhline(c='grey', lw=1)
axs[0].set_title('Original Data')

axs[1].scatter(x_standardization['A'], x_standardization['B'])
axs[1].set_xlim(-5, 5)
axs[1].set_ylim(-5, 5)
axs[1].axvline(c='grey', lw=1)
axs[1].axhline(c='grey', lw=1)
axs[1].set_title('Data after standardization')

plt.show()


# 각 컬럼의 최솟값은 0, 최댓값은 1로 바꿔줍니다.
fig, axs = plt.subplots(1,2, figsize=(12, 4),
                        gridspec_kw={'width_ratios': [2, 1]})

axs[0].scatter(x['A'], x['B'])
axs[0].set_xlim(-5, 15)
axs[0].set_ylim(-5, 5)
axs[0].axvline(c='grey', lw=1)
axs[0].axhline(c='grey', lw=1)
axs[0].set_title('Original Data')

axs[1].scatter(x_min_max['A'], x_min_max['B'])
axs[1].set_xlim(-5, 5)
axs[1].set_ylim(-5, 5)
axs[1].axvline(c='grey', lw=1)
axs[1].axhline(c='grey', lw=1)
axs[1].set_title('Data after min-max scaling')

plt.show()


# trade 데이터를 Standardization 기법으로 정규화합니다. 
cols = ['수출건수', '수출금액', '수입건수', '수입금액', '무역수지']
trade_Standardization= (trade[cols]-trade[cols].mean())/trade[cols].std()
trade_Standardization.head()


trade_Standardization.describe()


# trade 데이터를 min-max scaling 기법으로 정규화합니다. 
trade[cols] = (trade[cols]-trade[cols].min())/(trade[cols].max()-trade[cols].min())
trade.head()


trade.describe()


# train and test must be normalized with the same reference values
# (the min/max taken from train are applied to both further down).
train = pd.DataFrame([[10, -10], [30, 10], [50, 0]])
test = pd.DataFrame([[0, 1], [10, 10]])
print("👽 It's okay, no biggie.")

👽 It's okay, no biggie.


# Column-wise minimum and maximum of the training data only.
train_min = train.min()
train_max = train.max()

train_min_max = (train - train_min)/(train_max - train_min)
test_min_max =  (test - train_min)/(train_max - train_min)    # test is min-max scaled with the train statistics as well
print("💫 It's okay, no biggie...")

💫 It's okay, no biggie...


train_min_max


test_min_max


# 싸이킷런에도 정규화 함수가 있다.
from sklearn.preprocessing import MinMaxScaler
train = [[10, -10], [30, 10], [50, 0]]
test = [[0, 1]]
scaler = MinMaxScaler()
print("👽 It's okay, no biggie.")

👽 It's okay, no biggie.


scaler.fit_transform(train)

array([[0. , 0. ],
       [0.5, 1. ],
       [1. , 0.5]])


scaler.transform(test)

array([[-0.25,  0.55]])


#trade 데이터의 국가명 컬럼 원본
print(trade['국가명'].head())  

# get_dummies를 통해 국가명 원-핫 인코딩
country = pd.get_dummies(trade['국가명'])
country.head()

0    중국
1    미국
2    일본
3    중국
4    미국
Name: 국가명, dtype: object


trade = pd.concat([trade, country], axis=1)
trade.head()


trade.drop(['국가명'], axis=1, inplace=True)
trade.head()


salary = pd.Series([4300, 8370, 1750, 3830, 1840, 4220, 3020, 2290, 4740, 4600, 
                    2860, 3400, 4800, 4470, 2440, 4530, 4850, 4850, 4760, 4500, 
                    4640, 3000, 1880, 4880, 2240, 4750, 2750, 2810, 3100, 4290, 
                    1540, 2870, 1780, 4670, 4150, 2010, 3580, 1610, 2930, 4300, 
                    2740, 1680, 3490, 4350, 1680, 6420, 8740, 8980, 9080, 3990, 
                    4960, 3700, 9600, 9330, 5600, 4100, 1770, 8280, 3120, 1950, 
                    4210, 2020, 3820, 3170, 6330, 2570, 6940, 8610, 5060, 6370,
                    9080, 3760, 8060, 2500, 4660, 1770, 9220, 3380, 2490, 3450, 
                    1960, 7210, 5810, 9450, 8910, 3470, 7350, 8410, 7520, 9610, 
                    5150, 2630, 5610, 2750, 7050, 3350, 9450, 7140, 4170, 3090])
print("👽 Almost there..")

👽 Almost there..


salary.hist()

<AxesSubplot:>


bins = [0, 2000, 4000, 6000, 8000, 10000]
print("👽 Almost there..")

# cut을 이용해 수치형데이터를 범주형 구간데이터로 변형
ctg = pd.cut(salary, bins=bins)
ctg

👽 Almost there..

0      (4000, 6000]
1     (8000, 10000]
2         (0, 2000]
3      (2000, 4000]
4         (0, 2000]
          ...      
95     (2000, 4000]
96    (8000, 10000]
97     (6000, 8000]
98     (4000, 6000]
99     (2000, 4000]
Length: 100, dtype: category
Categories (5, interval[int64]): [(0, 2000] < (2000, 4000] < (4000, 6000] < (6000, 8000] < (8000, 10000]]


print('salary[0]:', salary[0])
print('salary[0]가 속한 카테고리:', ctg[0])

salary[0]: 4300
salary[0]가 속한 카테고리: (4000, 6000]


# 구간별 속해있는 데이터수
ctg.value_counts().sort_index()

(0, 2000]        12
(2000, 4000]     34
(4000, 6000]     29
(6000, 8000]      9
(8000, 10000]    16
dtype: int64


# bins값으로 구간개수를 정해줘도 됨
ctg = pd.cut(salary, bins=6)
ctg

0      (4230.0, 5575.0]
1      (8265.0, 9610.0]
2     (1531.93, 2885.0]
3      (2885.0, 4230.0]
4     (1531.93, 2885.0]
            ...        
95     (2885.0, 4230.0]
96     (8265.0, 9610.0]
97     (6920.0, 8265.0]
98     (2885.0, 4230.0]
99     (2885.0, 4230.0]
Length: 100, dtype: category
Categories (6, interval[float64]): [(1531.93, 2885.0] < (2885.0, 4230.0] < (4230.0, 5575.0] < (5575.0, 6920.0] < (6920.0, 8265.0] < (8265.0, 9610.0]]


ctg.value_counts().sort_index()

(1531.93, 2885.0]    27
(2885.0, 4230.0]     24
(4230.0, 5575.0]     21
(5575.0, 6920.0]      6
(6920.0, 8265.0]      7
(8265.0, 9610.0]     15
dtype: int64


# qcut은 정해진 값이 아닌 비슷한 크기의 값들로 구분
ctg = pd.qcut(salary, q=5)
ctg

0       (3544.0, 4648.0]
1       (7068.0, 9610.0]
2     (1539.999, 2618.0]
3       (3544.0, 4648.0]
4     (1539.999, 2618.0]
             ...        
95      (2618.0, 3544.0]
96      (7068.0, 9610.0]
97      (7068.0, 9610.0]
98      (3544.0, 4648.0]
99      (2618.0, 3544.0]
Length: 100, dtype: category
Categories (5, interval[float64]): [(1539.999, 2618.0] < (2618.0, 3544.0] < (3544.0, 4648.0] < (4648.0, 7068.0] < (7068.0, 9610.0]]


print(ctg.value_counts().sort_index())
print(".\n.\n🛸 Well done!")

(1539.999, 2618.0]    20
(2618.0, 3544.0]      20
(3544.0, 4648.0]      20
(4648.0, 7068.0]      20
(7068.0, 9610.0]      20
dtype: int64
.
.
🛸 Well done!


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Locate vgsales.csv under the user's home directory.
# os.path.expanduser('~') is used instead of os.getenv('HOME'): getenv
# returns None when HOME is not set (e.g. on Windows), and None + str raised
# TypeError before the file was ever opened.
csv_file_path = os.path.join(
    os.path.expanduser('~'), 'aiffel', 'data_preprocess', 'data2', 'vgsales.csv'
)
vg = pd.read_csv(csv_file_path)
vg.head()


# Find missing values in the vgsales data.
# This cell previously inspected `trade`, the DataFrame left over from the
# first half of the notebook, so it reported nothing about `vg`.
print('전체 데이터 건수:', len(vg))
print('컬럼별 결측치 개수')
len(vg) - vg.count()

전체 데이터 건수: 195
컬럼별 결측치 개수

기간      0
수출건수    0
수출금액    0
수입건수    0
수입금액    0
무역수지    0
미국      0
일본      0
중국      0
dtype: int64


# 중복데이터 찾기
vg.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
16593    False
16594    False
16595    False
16596    False
16597    False
Length: 16598, dtype: bool


vg[vg.duplicated()]


# 이상치 찾기
vg.loc[outlier(vg, 'Global_Sales', 5)]


# 정규화
# vgsales 데이터를 Standardization 기법으로 정규화합니다. 
cols = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales'] 
vg_Standardization= (vg[cols]-vg[cols].mean())/vg[cols].std()
vg_Standardization.head()


vg_Standardization.describe()


# 각 컬럼의 평균은 0으로, 분산은 1로 데이터를 바꿔줍니다.
fig, axs = plt.subplots(1,2, figsize=(12, 4),
                        gridspec_kw={'width_ratios': [2, 1]})

axs[0].scatter(vg['Global_Sales'], vg['Other_Sales'])
axs[0].set_xlim(-5, 15)
axs[0].set_ylim(-5, 5)
axs[0].axvline(c='grey', lw=1)
axs[0].axhline(c='grey', lw=1)
axs[0].set_title('Original Data')

axs[1].scatter(vg_Standardization['Global_Sales'], vg_Standardization['Other_Sales'])
axs[1].set_xlim(-5, 5)
axs[1].set_ylim(-5, 5)
axs[1].axvline(c='grey', lw=1)
axs[1].axhline(c='grey', lw=1)
axs[1].set_title('Data after standardization')

plt.show()


# 원핫인코딩
# vg데이터의 플랫폼
print(vg['Platform'].head())  

# get_dummies를 통해 플랫폼명 원-핫 인코딩
pf = pd.get_dummies(vg['Platform'])
pf.head()

0    Wii
1    NES
2    Wii
3    Wii
4     GB
Name: Platform, dtype: object


# 구간화
vgdata= vg['Global_Sales']
ctg = pd.cut(vgdata, bins=6)
ctg

0          (68.952, 82.74]
1         (27.587, 41.375]
2         (27.587, 41.375]
3         (27.587, 41.375]
4         (27.587, 41.375]
               ...        
16593    (-0.0727, 13.798]
16594    (-0.0727, 13.798]
16595    (-0.0727, 13.798]
16596    (-0.0727, 13.798]
16597    (-0.0727, 13.798]
Name: Global_Sales, Length: 16598, dtype: category
Categories (6, interval[float64]): [(-0.0727, 13.798] < (13.798, 27.587] < (27.587, 41.375] < (41.375, 55.163] < (55.163, 68.952] < (68.952, 82.74]]


bins = [0, 20, 40, 60, 80, 100]
ctg = pd.cut(vgdata, bins=bins)
ctg

0        (80, 100]
1         (40, 60]
2         (20, 40]
3         (20, 40]
4         (20, 40]
           ...    
16593      (0, 20]
16594      (0, 20]
16595      (0, 20]
16596      (0, 20]
16597      (0, 20]
Name: Global_Sales, Length: 16598, dtype: category
Categories (5, interval[int64]): [(0, 20] < (20, 40] < (40, 60] < (60, 80] < (80, 100]]


print(ctg.value_counts().sort_index())

(0, 20]      16578
(20, 40]        18
(40, 60]         1
(60, 80]         0
(80, 100]        1
Name: Global_Sales, dtype: int64

	기간	국가명	수출건수	수출금액	수입건수	수입금액	무역수지	기타사항
0	2015년 01월	중국	116932.0	12083947.0	334522.0	8143271.0	3940676.0	NaN
1	2015년 01월	미국	65888.0	5561545.0	509564.0	3625062.0	1936484.0	NaN
2	2015년 01월	일본	54017.0	2251307.0	82480.0	3827247.0	-1575940.0	NaN
3	2015년 02월	중국	86228.0	9927642.0	209100.0	6980874.0	2946768.0	NaN
4	2015년 02월	미국	60225.0	5021264.0	428678.0	2998216.0	2023048.0	NaN

	기간	국가명	수출건수	수출금액	수입건수	수입금액	무역수지
0	2015년 01월	중국	116932.0	12083947.0	334522.0	8143271.0	3940676.0
1	2015년 01월	미국	65888.0	5561545.0	509564.0	3625062.0	1936484.0
2	2015년 01월	일본	54017.0	2251307.0	82480.0	3827247.0	-1575940.0
3	2015년 02월	중국	86228.0	9927642.0	209100.0	6980874.0	2946768.0
4	2015년 02월	미국	60225.0	5021264.0	428678.0	2998216.0	2023048.0

	기간	국가명	수출건수	수출금액	수입건수	수입금액	무역수지
191	2020년 04월	미국	105360.0	NaN	1141118.0	5038739.0	NaN
196	2020년 06월	중국	NaN	NaN	NaN	NaN	NaN
197	2020년 06월	미국	NaN	NaN	NaN	NaN	NaN
198	2020년 06월	일본	NaN	NaN	NaN	NaN	NaN

	기간	국가명	수출건수	수출금액	수입건수	수입금액	무역수지
188	2020년 03월	미국	97117.0	7292838.0	1368345.0	5388338.0	1904500.0
191	2020년 04월	미국	105360.0	NaN	1141118.0	5038739.0	NaN
194	2020년 05월	미국	126598.0	4600726.0	1157163.0	4286873.0	313853.0

	기간	국가명	수출건수	수출금액	수입건수	수입금액	무역수지
6	2015년 03월	중국	117529.0	11868032.0	234321.0	7226911.0	4641121.0
75	2017년 02월	중국	159062.0	11118131.0	188555.0	6600637.0	4517495.0
80	2017년 03월	일본	65093.0	2395932.0	165734.0	5157589.0	-2761657.0
96	2017년 09월	중국	183442.0	13540683.0	295443.0	8443414.0	5097269.0
99	2017년 10월	중국	137873.0	12580474.0	244977.0	7932403.0	4648071.0
101	2017년 10월	일본	63510.0	1847999.0	127696.0	4418583.0	-2570584.0
102	2017년 11월	중국	421194.0	14000887.0	307790.0	9253318.0	4747569.0
105	2017년 12월	중국	218114.0	13848364.0	290347.0	8600132.0	5248232.0
114	2018년 03월	중국	232396.0	13576667.0	267249.0	8412516.0	5164151.0
116	2018년 03월	일본	80142.0	2603450.0	159601.0	5226141.0	-2622691.0
120	2018년 05월	중국	214145.0	13851900.0	307183.0	9279720.0	4572180.0
123	2018년 06월	중국	257130.0	13814241.0	279023.0	8713018.0	5101223.0
126	2018년 07월	중국	181772.0	13721233.0	293164.0	8869278.0	4851955.0
129	2018년 08월	중국	199010.0	14401521.0	280033.0	8525532.0	5875989.0
132	2018년 09월	중국	171328.0	14590529.0	280337.0	7889890.0	6700639.0
135	2018년 10월	중국	169809.0	14767041.0	319876.0	9963108.0	4803932.0

A.I

A.I

데이터 전처리 본문

데이터 전처리

데이터 전처리

1. 결측치

수치형 데이터를 보완하는 방법

범주형 데이터를 보완하는 방법

2. 중복된 데이터 제거

3. 이상치(Outlier)

Z-SCORE METHOD

IQR method

4. 정규화

5. 원-핫 인코딩

6. 구간화

데이터셋

연습

'파이썬 & AI 학습' 카테고리의 다른 글

티스토리툴바

	기간	국가명	수출건수	수출금액	수입건수	수입금액	무역수지
0	False	False	False	False	False	False	False
1	False	False	False	False	False	False	False
2	False	False	False	False	False	False	False
3	False	False	False	False	False	False	False
4	False	False	False	False	False	False	False
...	...	...	...	...	...	...	...
194	False	False	False	False	False	False	False
195	False	False	False	False	False	False	False
196	False	False	True	True	True	True	True
197	False	False	True	True	True	True	True
198	False	False	True	True	True	True	True

	기간	국가명	수출건수	수출금액	수입건수	수입금액	무역수지
186	2020년 03월	중국	248059.0	10658599.0	358234.0	8948918.0	1709682.0
187	2020년 03월	중국	248059.0	10658599.0	358234.0	8948918.0	1709682.0

	A	B
0	-3.075383	-0.448552
1	4.302209	-2.151012
2	-0.522519	-1.245304
3	1.394279	-1.169169
4	0.427537	-0.458080
...	...	...
95	13.250265	-0.734271
96	0.137697	-2.041710
97	2.614901	-3.057545
98	7.866735	-2.140529
99	0.297992	-0.503868

	A	B
0	-1.862058	0.559307
1	0.072383	-1.132240
2	-1.192684	-0.232338
3	-0.690090	-0.156691
4	-0.943575	0.549839
...	...	...
95	2.418607	0.275419
96	-1.019573	-1.023639
97	-0.370038	-2.032961
98	1.007019	-1.121824
99	-0.977542	0.504345

	A	B
0	0.012283	0.607216
1	0.458634	0.267692
2	0.166733	0.448318
3	0.282702	0.463502
4	0.224213	0.605316
...	...	...
95	1.000000	0.550235
96	0.206677	0.289490
97	0.356550	0.086901
98	0.674291	0.269782
99	0.216375	0.596184

	수출건수	수출금액	수입건수	수입금액	무역수지
0	-0.007488	1.398931	-0.163593	1.283660	1.256342
1	-0.689278	-0.252848	0.412529	-0.964444	0.401088
2	-0.847838	-1.091156	-0.993148	-0.863844	-1.097779
3	-0.417598	0.852853	-0.576399	0.705292	0.832209
4	-0.764918	-0.389673	0.146306	-1.276341	0.438027

	수출건수	수출금액	수입건수	수입금액	무역수지
count	1.950000e+02	1.950000e+02	1.950000e+02	1.950000e+02	1.950000e+02
mean	-1.019128e-16	5.921189e-17	6.091993e-17	-8.312439e-17	3.074464e-17
std	1.000000e+00	1.000000e+00	1.000000e+00	1.000000e+00	1.000000e+00
min	-9.194976e-01	-1.231761e+00	-9.984408e-01	-1.276341e+00	-1.603764e+00
25%	-5.937426e-01	-1.041338e+00	-7.673625e-01	-7.911669e-01	-1.116765e+00
50%	-4.373265e-01	-1.564700e-01	-3.429346e-01	-4.137392e-01	1.426824e-01
75%	4.420459e-01	1.037200e+00	3.927781e-01	8.827841e-01	7.461637e-01
max	5.486317e+00	2.078416e+00	3.239068e+00	2.376092e+00	2.434109e+00

	기간	국가명	수출건수	수출금액	수입건수	수입금액	무역수지
0	2015년 01월	중국	0.142372	0.794728	0.197014	0.700903	0.708320
1	2015년 01월	미국	0.035939	0.295728	0.332972	0.085394	0.496512
2	2015년 01월	일본	0.011187	0.042477	0.001249	0.112938	0.125310
3	2015년 02월	중국	0.078351	0.629759	0.099597	0.542551	0.603281
4	2015년 02월	미국	0.024131	0.254394	0.270146	0.000000	0.505660

	수출건수	수출금액	수입건수	수입금액	무역수지
count	195.000000	195.000000	195.000000	195.000000	195.000000
mean	0.143541	0.372113	0.235620	0.349450	0.397180
std	0.156108	0.302099	0.235988	0.273790	0.247655
min	0.000000	0.000000	0.000000	0.000000	0.000000
25%	0.050853	0.057527	0.054532	0.132836	0.120608
50%	0.075271	0.324844	0.154691	0.236172	0.432516
75%	0.212548	0.685450	0.328311	0.591147	0.581972
max	1.000000	1.000000	1.000000	1.000000	1.000000

	Rank	Name	Platform	Year	Genre	Publisher	NA_Sales	EU_Sales	JP_Sales	Other_Sales	Global_Sales
0	1	Wii Sports	Wii	2006.0	Sports	Nintendo	41.49	29.02	3.77	8.46	82.74
1	2	Super Mario Bros.	NES	1985.0	Platform	Nintendo	29.08	3.58	6.81	0.77	40.24
2	3	Mario Kart Wii	Wii	2008.0	Racing	Nintendo	15.85	12.88	3.79	3.31	35.82
3	4	Wii Sports Resort	Wii	2009.0	Sports	Nintendo	15.75	11.01	3.28	2.96	33.00
4	5	Pokemon Red/Pokemon Blue	GB	1996.0	Role-Playing	Nintendo	11.27	8.89	10.22	1.00	31.37

« 2024/07 »
일	월	화	수	목	금	토
	1	2	3	4	5	6
7	8	9	10	11	12	13
14	15	16	17	18	19	20
21	22	23	24	25	26	27
28	29	30	31

	0	1
0	0.0	0.0
1	0.5	1.0
2	1.0	0.5

	0	1
0	-0.25	0.55
1	0.00	1.00

	NA_Sales	EU_Sales	JP_Sales	Other_Sales	Global_Sales
0	50.478988	57.135209	11.937698	44.604742	52.862433
1	35.283374	6.793984	21.766640	3.828109	25.531734
2	19.083698	25.197026	12.002362	17.296594	22.689341
3	18.961252	21.496629	10.353428	15.440700	20.875869
4	13.475647	17.301527	32.791869	5.047696	19.827656

	NA_Sales	EU_Sales	JP_Sales	Other_Sales	Global_Sales
count	1.659800e+04	1.659800e+04	1.659800e+04	1.659800e+04	1.659800e+04
mean	-3.145335e-14	-6.160151e-14	1.270088e-14	4.582867e-14	-5.162491e-14
std	1.000000e+00	1.000000e+00	1.000000e+00	1.000000e+00	1.000000e+00
min	-3.240761e-01	-2.901982e-01	-2.514840e-01	-2.548567e-01	-3.391840e-01
25%	-3.240761e-01	-2.901982e-01	-2.514840e-01	-2.548567e-01	-3.070303e-01
50%	-2.261189e-01	-2.506217e-01	-2.514840e-01	-2.018312e-01	-2.362920e-01
75%	-3.020441e-02	-7.252779e-02	-1.221558e-01	-4.275459e-02	-4.336942e-02
max	5.047899e+01	5.713521e+01	3.279187e+01	5.579313e+01	5.286243e+01

	GB	...	Wii
0	0	...	1
1	0	...	0
2	0	...	1
3	0	...	1
4	1	...	0

데이터 가져오기 (0)	2021.02.15
선형회귀와 로지스틱회귀 (0)	2021.02.06
파이썬 문법 (0)	2021.02.01
비지도 학습( Unsupervised Learning ) (0)	2021.01.29
포켓몬 찾기 (0)	2021.01.27

	GB	...	Wii
0	0	...	1
1	0	...	0
2	0	...	1
3	0	...	1
4	1	...	0