본문 바로가기

Python

Python - 변수저장, 데이터프레임 필터(조회기능), 데이터프레임 병합

import seaborn as sns

# titanic 데이터를 가져와서 titanic 변수에 저장하기 
titanic = sns.load_dataset('titanic')
titanic 

# titanic 데이터셋에서 age, fare 2개 열을 선택하여 데이터프레임 df 만들기
import pandas as pd
df = titanic.loc[:,['age','fare']]
df

# df 데이터셋에 ten 컬럼에 10을 넣어서 추가하기
df['ten'] = 10 
print(df.head())

# 10을 더하여 저장 함수 add_10를 생성하기
def add_10(x):
    return x + 10

# 두개의 변수 더하여 저장하는 add_two_obj 함수
def add_two_obj(a,b):
    return a+b    

print(add_10(100))
print(add_two_obj(100,10))

# apply : 시리즈 데이터에 함수 적용하기
#       : 시리즈 데이터 각각의 요소에 add_10 함수를 적용
sr1 = df['age'].apply(add_10)
print(df['age'].head())
print(sr1.head())
print(type(sr1)) #시리즈 객체

sr2 = df['age'].apply(add_two_obj,b=10)
print(sr2.head())
print(type(sr2)) #시리즈 객체 

# 람다 방식을 이용하여 add_10 함수 실행하기 
sr3 = df['age'].apply(lambda x : add_10(x))
print(sr3.head())
print(type(sr3)) #시리즈 객체 

# applymap :
# apply 함수에 시리즈 객체를 적용하여, 데이터 프레임 객체로 생성
df_map = df.applymap(add_10)
print(df_map.head())


def missing_value(series) :
    return series.isnull() #불린 시리즈를 반환 

#
result = df.apply(missing_value, axis=0)
print(result.head())
print(type(result))

# DataFrame.apply (적용함수, axis = 0) # 행 기준. 열 값 = 시리즈 객체 
result = df.apply(missing_value, axis=0)
print(df.head())
print(result.head())
print(type(result))
print(df.info())

# 모든 시리즈 객체에 최대값과 최소값의 차이를 출력하기
def max_min(s) : 
    return s.max() - s.min() # 값으로 반환
result = df.apply(max_min, axis=0)
print(result.head())
print(type(result))

# add_two_obj 함수 : 두개의 매개변수를 받아서 합을 리턴
# age 컬럼과 ten 컬럼의 합을 add 컬럼에 저장
df["add"] = df["age"] + df["ten"]
print(df.head())    

# age 컬럼과 ten 컬럼의 합을 add2 컬럼에 저장 
# add_two_obj 함수
df['add2'] = df.apply(lambda x: add_two_obj\
                      (x['age'],x['ten']), axis=1) #0은 행 1은 열
print(df.head())

#
df['add'] = df['age'].apply(add_two_obj, df['ten'])
sr2 = df['age'].apply(add_two_obj, b=df['ten'])
sr2.head()

sr2 = df['age'].apply(add_two_obj,b=df['ten'])
print(sr2.head())


# 각 열의 Nan 개수 반환 - 데이터프레임 전달하면 시리즈 반환 
def missing_count(x) :
    return missing_value(x).sum()

# 데이터 프레임의 총 Nan 개수 - 데이터프레임 전달하면 값을 반환
def total_number_missing(x):
    return missing_count(x).sum() 


result = df.apply(missing_value)
print(result.head())
print(type(result))


result = df.apply(missing_count)
print(result.head())
print(type(result))


result = df.pipe(missing_count)
print(result.head())
print(type(result))

#
result = df.apply(total_number_missing)
print(result.head())
print(type(result))

#
result = df.pipe(total_number_missing)
print(result)
print(type(result))

### 데이터 프레임의 컬럼 관련 예제
titanic = sns.load_dataset('titanic')
df = titanic.loc[0:4, 'survived':'age']
print(df)

### 열 이름의 리스트 만들기
print(df.columns)
print(df.columns.values)
columns = list(df.columns)
print(columns)
print(type(columns))

print(df.columns.values)
print(type(df.columns.values))

# 배열 객체를 list로 변환하기 
columns = list(df.columns.values)
print(columns)
print(type(columns))

# 열 이름을 알파벳 순으로 정렬하기
columns_sorted = sorted(columns) #알파벳 순으로 정렬
print(columns_sorted)

# 컬렴명 순으로 조회하기
df_sorted = df[columns_sorted]
print(df_sorted)

# 열 이름을 알파벳 역순으로 정렬하기 
columns_reversed = list(reversed(columns_sorted))
print(columns_reversed)
df_reversed = df[columns_reverse]
print(df_reversed)

# 컬럼명 역순으로 조회하기
df_sorted = df[columns_reversed]
print(df_sorted)

# 열 이름을 사용자가 정의한 임의의 순서로 재배치하기
columns_customed = ["pclass",'sex','age','survived']
df_customed = df[columns_customed]
print(df_customed)

# 주가데이터 df 데이터프레임으로 저장하기 
df = pd.read_excel("주가데이터.xlsx",engine="openpyxl")
print(df)
print(df.dtypes)

주가데이터.xlsx
0.01MB


# 연월일 컬럼으로 df['년'],df['월'],df['일'] 
df['년'] = df["연월일"].dt.year
df['월'] = df["연월일"].dt.month
df['일'] = df["연월일"].dt.day
print(df.head())
print(df.dtypes)


df['연월일2'] = df['연월일'].astype('str')
print(df.dtypes)
print(df['연월일2'])

# 문자열을 split() 메서드로 분리
# 문자열의 멤버함수인 split()
# .str : 문자열로 형변환
dates = df['연월일2'].str.split('-')
print(type(df['연월일2']))
print(df["연월일2"].dtype)
print(dates)
df['년2'] = dates.str.get(0)
df['월2'] = dates.str.get(1)
df['일2'] = dates.str.get(2)
print(df.dtypes)
print(df.head())

# 필터 : 조건에 맞는 데이터를 조회
titanic = sns.load_dataset('titanic')
titanic

# 타이타닉 승객중 10대(10~19세)인 승객만 조회하기
mask1 = (titanic.age >= 10) & (titanic.age < 20)
print(mask1.head())
df_teenage = titanic.loc[mask1,:]
print(df_teenage.head())

# 나이가 10세 미만 (0~9세)이고 여성인 승객만 따로 선택
mask2 = (titanic.age < 10) & (titanic.sex == 'female')
df_female_under10 = titanic.loc[mask2,:]
print(df_female_under10)

# 나이가 10세 미만 이거나 60세 이상 승객의 age,sex,alone만 조호히가ㅣ
mask3 = (titanic.age < 10) | (titanic.age >= 60)
df_under10_morethan60 = titanic.loc[mask2,['age','sex','alone']]
print(df_under10_morethan60)

# 동행자의 수가 3,4,5인 승객
mask4 = (titanic.sibsp) >=3 & (titanic.sibsp <= 5) 
titanic.loc[mask4,:]

# isin() 함수 사용 
mask5 = titanic["sibsp"].isin([3,4,5])
titanic.loc[mask5,:]

print(titanic.info())
print(titanic["class"])

# class 컬럼 중 First, Second인 행만 조회하기
df_12 = titanic[titanic['class'].isin(["First","Second"])]
print(df_12.head())

# 데이터 프레임 병합
import pandas as pd
df1 = pd.DataFrame({'a':['a0','a1','a2','a3'],
                    'b':['b0','b1','b2','b3'],
                    'c':['c0','c1','c2','c3']},
                         index=[0,1,2,3])
df2 = pd.DataFrame({'a':['a2','a3','a4','a5'],
                    'b':['b2','b3','b4','b5'],
                    'c':['c2','c3','c4','c5'],
                    'd':['d2','d3','d4','d5']},
                         index=[2,3,4,5])
print(df1)
print(df2)

# 행기준으로 연결. 컬럼명을 기준으로 연결 
result1 = pd.concat([df1,df2])
print(result1)

# 행기준으로 연결 인덱스이름을 다시 설정
result2 = pd.concat([df1,df2],ignore_index=True)
print(result2)

# 열 기준으로 연결하기. 인덱스명 기준 연결.
# 만약 존재하지 않는 인덱스가 있는 경우 Nan 값으로 설정.
# join='outer' : 기본값.
result3 = pd.concat([df1,df2],axis=1, join='outer')
print(result3)

# 열기준 연결. 교집합만 출력. 인덱스 기준
result4 = pd.concat([df1,df2],axis=1, join='inner')
print(result4)

# 시리즈로 데이터 프레임 객체 생성하기
sr1 = pd.Series(['e0','e1','e2','e3'], name='e')
sr2 = pd.Series(['f0','f1','f2'], name='f',index=[3,4,5])
sr3 = pd.Series(['g0','g1','g2','g3'], name='g')

print(sr1)
print(sr2)
print(sr3)

# df1과 sr1 두개의 데이터를 열기준 연결하기. index이름 기준.
result5 = pd.concat([df1,sr1],axis=1)
print(result5)
 
# df2와 sr2 두개의 데이터를 열기준 연결하기. 
result6 = pd.concat([df2,sr2],axis=1,sort=True)
print(result6)

# sr1과 sr3 두개의 데이터를 열기준 연결하기.
result7 = pd.concat([sr1,sr3],axis=1)
print(result7)
print(type(result7))

# sr1과 sr3 두개의 데이터를 행기준 연결하기.
result8 = pd.concat([sr1,sr3],axis=0)
print(result8)
print(type(result8))
result8 = pd.concat([sr1,sr3],axis=0,ignore_index=True)
print(result8)
print(type(result8))

stockprice.xlsx
0.01MB
stockvaluation.xlsx
0.01MB


# stockprice.xlsx, stockvaluation.xlsx 파일을 읽어서
# 각각 df1, df2 저장하기
df1 = pd.read_excel("stockprice.xlsx",engine="openpyxl")
df2 = pd.read_excel("stockvaluation.xlsx",engine="openpyxl")
print(df1)
print(df2)

# df1,df2를 열기준으로 연결하기
result9 = pd.concat([df1,df2],axis=1)
print(result9)
print(result9.columns)

# df1,df2를 열기준으로 병합하기
result10 = pd.merge(df1,df2)
print(result10)
print(result10.columns)

# 병합시, 두개의 데이터프레임을 연결 컬럼을 설정하기 
# 연결 컬럼을 키(key)라 한다.
# outer 방식
result11 = pd.merge(df1,df2,on='id',how='outer')
print(result11)
print(result11.columns)

# 병합시 사용되는 키의 이름이 다른 경우
# how = 'left' : 왼쪽 데이터셋(df1)의 내용 모두 조회 
# left_on = 'stock_name', 'right_on'='name' : 키 값을 설정
#                                             키로 사용되는 컬럼명이 양쪽이 다르다.
result12 = pd.merge(df1,df2, how='left', left_on='stock_name', right_on='name')
print(result12)
print(result12.columns)
print(df1)

#df2 데이터셋의 내용을 모두 조회되도록, 양쪽의 키는 
# df1의 stock_name,df2의 name 컬럼으로 한다.
# pd.set_option ipython의 출력 부분 설정
pd.set_option('display.max_columns',10)
pd.set_option('display.max_colwidth',20)

result13 = pd.merge(df1,df2, how='outer',left_on='stock_name',right_on='name')
print(result13)

# df1 데이터 중 금액이 50000 미만인 데이터셋과
# df2 데이터를 id 컬럼으로 병합하기 
mask1 = df1.price < 50000
df3 = df1.loc[mask1,:]
result14 = pd.merge(df3,df2,how='left',on='id')
result14

price_under = df1[df1["price"]<50000]
print(price_under)

result15 = pd.merge(price_under,df2)
result15