A.I
파이썬 데이터베이스 만들기 Pandas 본문
데이터베이스 관리¶
- mkdir -p ~/aiffel/data_handling/data
- wget https://aiffelstaticprd.blob.core.windows.net/media/documents/db_data.zip
- mv db_data.zip ~/aiffel/data_handling/data
- cd ~/aiffel/data_handling/data
- unzip db_data.zip
데이터 관리 프로그램 만들기 - 월급이 가장 높은 사원 찾기¶
In [1]:
# 텍스트 파일에서 불러온 각각의 데이터는 string이므로 형 변환시켜야함
class Employee:
def __init__ (self, name, division, jobgrade, years, basic):
self.name = name
self.division = division
self.jobgrade = int(jobgrade)
self.years = int(years)
self.basic = float(basic)
print('슝=3')
슝=3
In [2]:
employee1 = Employee('Ahn TaeHee', 'AI Team', '3', '3', '300')
In [3]:
def make_employee(inputdata):
name, division, jopgrade, year, basic = inputdata.split(',')
return Employee(name, division, jopgrade, year, basic)
print('슝=3')
슝=3
In [4]:
import os
file_path = os.getenv("HOME") + '/aiffel/data_handling/data/employeefile.txt'
print(file_path)
inputfile = open(file_path, 'r')
for i in inputfile:
e = make_employee(i)
/home/ssac24/aiffel/data_handling/data/employeefile.txt
In [5]:
class Employee:
def get_name(self):
return self.name
def get_division(self):
return self.division
def get_jobgrade(self):
return self.jobgrade
def get_years(self):
return self.years
def get_basic(self):
return self.basic
print('슝=3')
슝=3
In [6]:
def get_salary(self):
self.salary = self.basic + (self.basic * self.years * 0.1)
return self.salary
완성된 클래스 코드¶
In [7]:
import os
class Employee:
def __init__ (self, name, division, jobgrade, years, basic):
self.name = name
self.division = division
self.jobgrade = int(jobgrade)
self.years = int(years)
self.basic = float(basic)
self.salary = 0
def get_name(self):
return self.name
def get_division(self):
return self.division
def get_jobgrade(self):
return self.jobgrade
def get_years(self):
return self.years
def get_basic(self):
return self.basic
def get_salary(self):
self.salary = self.basic + (self.basic * self.years * 0.1)
return self.salary
def make_employee(inputdata):
name, division, jopgrade, year, basic = inputdata.split(',')
return Employee(name, division, jopgrade, year, basic)
def summarize():
file_path = os.getenv("HOME") + '/aiffel/data_handling/data/employeefile.txt'
inputfile = open(file_path, 'r')
highest = make_employee(inputfile.readline())
for l in inputfile:
e = make_employee(l)
if e.get_salary() > highest.get_salary():
highest = e
inputfile.close()
print("The name of the highest salary employee is: ", highest.get_name())
print("Years of service: ", highest.get_years())
print("Division: ", highest.get_division())
print("Salay: ", highest.get_salary())
print('슝=3')
슝=3
In [8]:
summarize()
The name of the highest salary employee is: Kim YoungJae Years of service: 7 Division: Sales Salay: 1020.0
pd.merge()¶
In [1]:
import pandas as pd
df1 = pd.DataFrame({'Student': ['KimTaemin','HaJaehwa','JungSayoung','Sonjimin','Leesoomin','KangJun'],
'Korean': [90, 85, 88, 35, 40, 44],
'English': [80, 90, 40, 44, 55, 90]})
df2 = pd.DataFrame({'Student': ['KimTaemin','HaJaehwa','JungSayoung','Sonjimin','Leesoomin','KangJun'],
'Math': [100, 55, 38, 43, 68, 82]})
print(df1)
print('---')
print(df2)
Student Korean English 0 KimTaemin 90 80 1 HaJaehwa 85 90 2 JungSayoung 88 40 3 Sonjimin 35 44 4 Leesoomin 40 55 5 KangJun 44 90 --- Student Math 0 KimTaemin 100 1 HaJaehwa 55 2 JungSayoung 38 3 Sonjimin 43 4 Leesoomin 68 5 KangJun 82
In [2]:
pd.merge(df1, df2)
Out[2]:
Student | Korean | English | Math | |
---|---|---|---|---|
0 | KimTaemin | 90 | 80 | 100 |
1 | HaJaehwa | 85 | 90 | 55 |
2 | JungSayoung | 88 | 40 | 38 |
3 | Sonjimin | 35 | 44 | 43 |
4 | Leesoomin | 40 | 55 | 68 |
5 | KangJun | 44 | 90 | 82 |
In [3]:
# on에 키값(공통 컬럼명)을 넣어 확실하게 표시할수 있다.
pd.merge(df1, df2, on='Student')
Out[3]:
Student | Korean | English | Math | |
---|---|---|---|---|
0 | KimTaemin | 90 | 80 | 100 |
1 | HaJaehwa | 85 | 90 | 55 |
2 | JungSayoung | 88 | 40 | 38 |
3 | Sonjimin | 35 | 44 | 43 |
4 | Leesoomin | 40 | 55 | 68 |
5 | KangJun | 44 | 90 | 82 |
# pd.merge(how='inner or outer')¶
In [4]:
df1 = pd.DataFrame({'Student': ['KimTaemin','HaJaehwa','JungSayoung','Sonjimin','Leesoomin','KangJun'],
'Korean': [90, 85, 88, 35, 40, 44],
'English': [80, 90, 40, 44, 55, 90]})
df2 = pd.DataFrame({'Student': ['Jiyoungmin','KimTaemin'],
'Math':[44,33]})
print(df1)
print('---')
print(df2)
Student Korean English 0 KimTaemin 90 80 1 HaJaehwa 85 90 2 JungSayoung 88 40 3 Sonjimin 35 44 4 Leesoomin 40 55 5 KangJun 44 90 --- Student Math 0 Jiyoungmin 44 1 KimTaemin 33
In [5]:
pd.merge(df1, df2, how='inner')
Out[5]:
Student | Korean | English | Math | |
---|---|---|---|---|
0 | KimTaemin | 90 | 80 | 33 |
In [6]:
pd.merge(df1, df2)
Out[6]:
Student | Korean | English | Math | |
---|---|---|---|---|
0 | KimTaemin | 90 | 80 | 33 |
In [7]:
pd.merge(df1, df2, how='outer')
Out[7]:
Student | Korean | English | Math | |
---|---|---|---|---|
0 | KimTaemin | 90.0 | 80.0 | 33.0 |
1 | HaJaehwa | 85.0 | 90.0 | NaN |
2 | JungSayoung | 88.0 | 40.0 | NaN |
3 | Sonjimin | 35.0 | 44.0 | NaN |
4 | Leesoomin | 40.0 | 55.0 | NaN |
5 | KangJun | 44.0 | 90.0 | NaN |
6 | Jiyoungmin | NaN | NaN | 44.0 |
df.join()¶
In [8]:
df1.join(df2, how='outer', lsuffix='_caller', rsuffix='_other') # '_caller'인 df1 컬럼이 왼쪽에 가도록 배치
Out[8]:
Student_caller | Korean | English | Student_other | Math | |
---|---|---|---|---|---|
0 | KimTaemin | 90 | 80 | Jiyoungmin | 44.0 |
1 | HaJaehwa | 85 | 90 | KimTaemin | 33.0 |
2 | JungSayoung | 88 | 40 | NaN | NaN |
3 | Sonjimin | 35 | 44 | NaN | NaN |
4 | Leesoomin | 40 | 55 | NaN | NaN |
5 | KangJun | 44 | 90 | NaN | NaN |
df.concat() - 데이터 행 합치기¶
In [9]:
df1 = pd.DataFrame({'Student': ['KimTaemin','HaJaehwa','JungSayoung','Sonjimin','Leesoomin','KangJun'],
'Korean': [90, 85, 88, 35, 40, 44],
'English': [80, 90, 40, 44, 55, 90]})
df2 = pd.DataFrame({'Student': ['Jiyoungmin','LeeJae','KimJaehee'],
'Korean': [44,73,100]})
print(df1)
print(df2)
Student Korean English 0 KimTaemin 90 80 1 HaJaehwa 85 90 2 JungSayoung 88 40 3 Sonjimin 35 44 4 Leesoomin 40 55 5 KangJun 44 90 Student Korean 0 Jiyoungmin 44 1 LeeJae 73 2 KimJaehee 100
In [10]:
pd.concat([df1,df2], sort=False)
Out[10]:
Student | Korean | English | |
---|---|---|---|
0 | KimTaemin | 90 | 80.0 |
1 | HaJaehwa | 85 | 90.0 |
2 | JungSayoung | 88 | 40.0 |
3 | Sonjimin | 35 | 44.0 |
4 | Leesoomin | 40 | 55.0 |
5 | KangJun | 44 | 90.0 |
0 | Jiyoungmin | 44 | NaN |
1 | LeeJae | 73 | NaN |
2 | KimJaehee | 100 | NaN |
In [12]:
pd.concat([df1, df2], sort=False, ignore_index=True)
Out[12]:
Student | Korean | English | |
---|---|---|---|
0 | KimTaemin | 90 | 80.0 |
1 | HaJaehwa | 85 | 90.0 |
2 | JungSayoung | 88 | 40.0 |
3 | Sonjimin | 35 | 44.0 |
4 | Leesoomin | 40 | 55.0 |
5 | KangJun | 44 | 90.0 |
6 | Jiyoungmin | 44 | NaN |
7 | LeeJae | 73 | NaN |
8 | KimJaehee | 100 | NaN |
필터링 연산¶
In [13]:
df = pd.DataFrame({"A": [1,4,7], "B": [2,5,8], "C":[3,6,9]})
df
Out[13]:
A | B | C | |
---|---|---|---|
0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 |
2 | 7 | 8 | 9 |
In [14]:
df['A']
Out[14]:
0 1 1 4 2 7 Name: A, dtype: int64
In [15]:
# df.loc[[행],[열]]
print(df.loc[0])
A 1 B 2 C 3 Name: 0, dtype: int64
In [16]:
print(df.loc[0, 'B'])
2
In [17]:
print(df.loc[:, 'A'])
0 1 1 4 2 7 Name: A, dtype: int64
In [18]:
# df.iloc[[행],[열]]
print(df.iloc[0])
A 1 B 2 C 3 Name: 0, dtype: int64
In [19]:
print(df.iloc[:, 0])
0 1 1 4 2 7 Name: A, dtype: int64
In [21]:
print(df.iloc[0, 1])
2
그룹연산: groupby(), apply()¶
In [22]:
df = pd.DataFrame({'Columns1':['A','A','B','B','C','C','A','B'],
'Columns2':[10, 2, 30, -6, 8, 9, 5, 2]})
df
Out[22]:
Columns1 | Columns2 | |
---|---|---|
0 | A | 10 |
1 | A | 2 |
2 | B | 30 |
3 | B | -6 |
4 | C | 8 |
5 | C | 9 |
6 | A | 5 |
7 | B | 2 |
In [23]:
import numpy as np
df.groupby(['Columns1']).max().apply(np.sqrt)
Out[23]:
Columns2 | |
---|---|
Columns1 | |
A | 3.162278 |
B | 5.477226 |
C | 3.000000 |
Pandas Transform¶
In [25]:
import pandas as pd
import numpy as np
import os
file_path = os.getenv("HOME") + '/aiffel/data_handling/data/Salaries.csv'
sal = pd.read_csv(file_path)
sal.head(5)
/home/ssac24/anaconda3/envs/aiffel/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3156: DtypeWarning: Columns (12) have mixed types.Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
Out[25]:
Id | EmployeeName | JobTitle | BasePay | OvertimePay | OtherPay | Benefits | TotalPay | TotalPayBenefits | Year | Notes | Agency | Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | NATHANIEL FORD | GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY | 167411.18 | 0.00 | 400184.25 | NaN | 567595.43 | 567595.43 | 2011 | NaN | San Francisco | NaN |
1 | 2 | GARY JIMENEZ | CAPTAIN III (POLICE DEPARTMENT) | 155966.02 | 245131.88 | 137811.38 | NaN | 538909.28 | 538909.28 | 2011 | NaN | San Francisco | NaN |
2 | 3 | ALBERT PARDINI | CAPTAIN III (POLICE DEPARTMENT) | 212739.13 | 106088.18 | 16452.60 | NaN | 335279.91 | 335279.91 | 2011 | NaN | San Francisco | NaN |
3 | 4 | CHRISTOPHER CHONG | WIRE ROPE CABLE MAINTENANCE MECHANIC | 77916.00 | 56120.71 | 198306.90 | NaN | 332343.61 | 332343.61 | 2011 | NaN | San Francisco | NaN |
4 | 5 | PATRICK GARDNER | DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT) | 134401.60 | 9737.00 | 182234.59 | NaN | 326373.19 | 326373.19 | 2011 | NaN | San Francisco | NaN |
In [26]:
sal['BasePay'].mean()
Out[26]:
66327.68895967308
In [27]:
sal['OvertimePay'].max()
Out[27]:
245131.88
In [28]:
sal[sal['EmployeeName'] == 'JOSEPH DRISCOLL']['JobTitle']
Out[28]:
24 CAPTAIN, FIRE SUPPRESSION Name: JobTitle, dtype: object
In [29]:
sal[sal['EmployeeName'] == 'JOSEPH DRISCOLL']['TotalPayBenefits']
Out[29]:
24 270324.91 Name: TotalPayBenefits, dtype: float64
In [30]:
sal[sal['TotalPayBenefits']==sal['TotalPayBenefits'].max()]['EmployeeName']
Out[30]:
0 NATHANIEL FORD Name: EmployeeName, dtype: object
In [31]:
sal[sal['TotalPayBenefits']==sal['TotalPayBenefits'].min()]
Out[31]:
Id | EmployeeName | JobTitle | BasePay | OvertimePay | OtherPay | Benefits | TotalPay | TotalPayBenefits | Year | Notes | Agency | Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
110530 | 110531 | David P Kucia | Police Officer 3 | NaN | 0.0 | 0.0 | -33.89 | 0.0 | -33.89 | 2013 | NaN | San Francisco | NaN |
In [32]:
sal.groupby('Year').mean()['BasePay']
Out[32]:
Year 2011 63595.956517 2012 65436.406857 2013 69630.030216 2014 66573.154204 Name: BasePay, dtype: float64
In [33]:
sal['JobTitle'].nunique()
Out[33]:
2158
In [34]:
sal['JobTitle'].value_counts().head(5)
Out[34]:
Transit Operator 7036 Special Nurse 4389 Registered Nurse 3736 Public Svc Aide-Public Works 2518 Police Officer 3 2421 Name: JobTitle, dtype: int64
In [35]:
def chief_string(title):
if 'chief' in title.lower().split():
return True
else:
return False
sum(sal['JobTitle'].apply(lambda x : chief_string(x)))
Out[35]:
477
다중 사용자 환경¶
In [36]:
C_account = 100 # 현재 C 고객 계좌 잔고는 100
C_account_by_A_process = 100 # A 고객 응대 프로그램이 C 고객 계좌 잔고를 조회했습니다
C_account_by_B_process = 100 # B 고객 응대 프로그램이 C 고객 계좌 잔고를 조회했습니다
C_account_by_A_process = C_account_by_A_process + 50 # A 고객이 50을 입금했습니다.
C_account_by_B_process = C_account_by_B_process + 30 # B 고객이 30을 입금했습니다.
C_account = C_account_by_A_process # A 고객의 입금이 C 고객 계좌 잔고에 반영되었습니다.
C_account = C_account_by_B_process # B 고객의 입금이 C 고객 계좌 잔고에 반영되었습니다.
print(C_account) # C 고객 계좌 잔고는 얼마일까요?
130
SQL¶
- #### DDL
- #### DML
파이썬 DB-API¶
- connect(): 데이터베이스의 연결을 만든다.
- cursor(): 질의를 관리하기 위한 커서 객체를 만든다. (file의 open과 비슷)
- execute(), excutemany(): 데이터베이스에 하나 이상의 SQL명령을 실행한다.
- fetchone(), fetchmany(), fetchall(): 실행 결과를 얻는다.
In [43]:
import sqlite3
conn = sqlite3.connect('example.db')
In [44]:
# 실제 접속 시 필요한 구문
#import MySQLdb
#conn = MySQLdb.connect(host='localhost', port=3308, db='example.db', user='root',
#passwd='1234')
In [45]:
# 커서 객체 만들기
c = conn.cursor()
In [46]:
# Create table
c.execute('''CREATE TABLE stocks2
(date text, trans text, symbol text, qty real, price real)''')
# Insert a row of data
c.execute("INSERT INTO stocks2 VALUES ('2006-01-05','BUY','RHAT',100,35.14)")
Out[46]:
<sqlite3.Cursor at 0x7fcd985da1f0>
In [47]:
# 데이터베이스 반영
conn.commit()
In [48]:
conn.close()
1단계. 메모리에 SQLite3 데이터베이스를 만들고 4가지 속성을 지닌 sales 테이블을 만든다.¶
In [49]:
import sqlite3
conn = sqlite3.connect(':memory:')
query = """CREATE TABLE sales
(product VARCHAR(20),
maker VARCHAR(40),
amount FLOAT,
date DATE);"""
conn.execute(query)
conn.commit()
2단계. sales 테이블에 데이터를 삽입한다.¶
In [50]:
data = [('smart phone', 'Apple', '47.2', '2019-08-03'),
('printer', 'Samsung', '15.2', '2018-02-10',),
('monitor', 'LG', '14.25', '2019-12-20'),
('python book', 'Amazon', '12.40', '2020-01-23')]
statement = "INSERT INTO sales VALUES(?, ?, ?, ?)"
conn.executemany(statement, data)
conn.commit()
3단계. sales 테이블에 질의한다.¶
In [51]:
cursor = conn.execute("SELECT * FROM sales")
rows = cursor.fetchall()
4단계. 출력 데이터의 개수를 샌다.¶
In [52]:
row_counter = 0
for row in rows:
print(row)
row_counter +=1
print("Number of rows: {}".format(row_counter))
('smart phone', 'Apple', 47.2, '2019-08-03') ('printer', 'Samsung', 15.2, '2018-02-10') ('monitor', 'LG', 14.25, '2019-12-20') ('python book', 'Amazon', 12.4, '2020-01-23') Number of rows: 4
In [53]:
# 전체코드
import sqlite3
conn = sqlite3.connect(':memory:')
query = """CREATE TABLE sales
(product VARCHAR(20),
maker VARCHAR(40),
amount FLOAT,
date DATE);"""
conn.execute(query)
conn.commit()
data = [('smart phone', 'Apple', '47.2', '2019-08-03'),
('printer', 'Samsung', '15.2', '2018-02-10',),
('monitor', 'LG', '14.25', '2019-12-20'),
('python book', 'Amazon', '12.40', '2020-01-23')]
statement = "INSERT INTO sales VALUES(?, ?, ?, ?)"
conn.executemany(statement, data)
conn.commit()
cursor = conn.execute("SELECT * FROM sales")
rows = cursor.fetchall()
row_counter = 0
for row in rows:
print(row)
row_counter +=1
print("Number of rows: {}".format(row_counter))
('smart phone', 'Apple', 47.2, '2019-08-03') ('printer', 'Samsung', 15.2, '2018-02-10') ('monitor', 'LG', 14.25, '2019-12-20') ('python book', 'Amazon', 12.4, '2020-01-23') Number of rows: 4
'파이썬 & AI 학습' 카테고리의 다른 글
활성화 함수의 이해 (0) | 2021.03.17 |
---|---|
딥러닝 레이어에 대한 이해 (0) | 2021.03.12 |
Hadoop & Spark (0) | 2021.03.10 |
Linear, Convolution layer (0) | 2021.03.06 |
MapReduce 원리 (0) | 2021.03.04 |