728x90
import pandas as pd
문제1. user_id를 인덱스로 사용하여 occupation.tsv 파일을 데이터프레임으로 읽기
# Load the occupation table. The file is read as pipe-delimited (sep="|")
# even though its name ends in .tsv, and user_id becomes the row index.
df = pd.read_csv(
    "./data/occupation.tsv",
    sep="|",
    index_col="user_id",
)
df.head()

문제2. 각 occupation별 age의 평균 구하기
# Average age within each occupation group.
age_groups = df.groupby("occupation")["age"]
age_groups.mean()
occupation
administrator 38.746835
artist 31.392857
doctor 43.571429
educator 42.010526
engineer 36.388060
entertainment 29.222222
executive 38.718750
healthcare 41.562500
homemaker 32.571429
lawyer 36.750000
librarian 40.000000
marketing 37.615385
none 26.555556
other 34.523810
programmer 33.121212
retired 63.071429
salesman 35.666667
scientist 35.548387
student 22.081633
technician 33.148148
writer 36.311111
Name: age, dtype: float64
문제3. 각 occupation별 남자의 비율 구하기
- 단, 남자 비율을 기준으로 내림차순 정렬하여 출력하기
# Flag each row as male (1) or not (0) with a vectorized comparison instead
# of calling a Python lambda once per row. Casting to int64 keeps the same
# integer dtype the mapped version produced, so later sums and means are
# unchanged.
df["male_n"] = (df["gender"] == "M").astype("int64")
df.head()

# Number of male rows per occupation: male_n is a 0/1 flag, so the
# per-group sum is the count of rows flagged 1.
male_flags_by_occupation = df.groupby("occupation")["male_n"]
sum_male = male_flags_by_occupation.sum()
sum_male
occupation
administrator 43
artist 15
doctor 7
educator 69
engineer 65
entertainment 16
executive 29
healthcare 5
homemaker 1
lawyer 10
librarian 22
marketing 16
none 5
other 69
programmer 60
retired 13
salesman 9
scientist 28
student 136
technician 26
writer 26
Name: male_n, dtype: int64
# Rows per occupation, used as the denominator of the male ratio.
sum_all = df["occupation"].value_counts()
# Divide male counts by totals (matched on the occupation label), then list
# the highest ratio first.
male_ratio = sum_male.div(sum_all)
male_ratio.sort_values(ascending=False)
occupation
doctor 1.000000
engineer 0.970149
technician 0.962963
retired 0.928571
programmer 0.909091
executive 0.906250
scientist 0.903226
entertainment 0.888889
lawyer 0.833333
salesman 0.750000
educator 0.726316
student 0.693878
other 0.657143
marketing 0.615385
writer 0.577778
none 0.555556
administrator 0.544304
artist 0.535714
librarian 0.431373
healthcare 0.312500
homemaker 0.142857
dtype: float64
# 다른 풀이
남자 수 합계 / 전체 데이터 수 == 평균이므로, male_n의 그룹별 평균이 곧 남자 비율이다.
# The group mean of male_n is (sum of male_n / number of rows in the group),
# i.e. the male ratio per occupation; show the highest ratio first.
male_share = df.groupby("occupation")["male_n"].mean()
male_share.sort_values(ascending=False)
occupation
doctor 1.000000
engineer 0.970149
technician 0.962963
retired 0.928571
programmer 0.909091
executive 0.906250
scientist 0.903226
entertainment 0.888889
lawyer 0.833333
salesman 0.750000
educator 0.726316
student 0.693878
other 0.657143
marketing 0.615385
writer 0.577778
none 0.555556
administrator 0.544304
artist 0.535714
librarian 0.431373
healthcare 0.312500
homemaker 0.142857
Name: male_n, dtype: float64
# Alternative: count the male rows per occupation and divide by the total
# number of rows per occupation.
male_counts = df[df["gender"] == "M"].groupby("occupation")["gender"].count()
occupation_totals = df["occupation"].value_counts()
# An occupation with no male rows is missing from male_counts; fill_value=0
# makes its ratio 0.0 instead of NaN. The result is not sorted by ratio.
male_counts.div(occupation_totals, fill_value=0)
occupation
administrator 0.544304
artist 0.535714
doctor 1.000000
educator 0.726316
engineer 0.970149
entertainment 0.888889
executive 0.906250
healthcare 0.312500
homemaker 0.142857
lawyer 0.833333
librarian 0.431373
marketing 0.615385
none 0.555556
other 0.657143
programmer 0.909091
retired 0.928571
salesman 0.750000
scientist 0.903226
student 0.693878
technician 0.962963
writer 0.577778
dtype: float64
# Alternative: size of every (occupation, gender) group divided by the
# number of rows in that occupation, keeping only the "M" slice.
pair_counts = df.groupby(["occupation", "gender"]).size()
occupation_totals = df["occupation"].value_counts()
# NOTE(review): this division relies on pandas matching the two Series on
# the shared "occupation" index level - confirm on the pandas version in use.
pair_ratio = pair_counts / occupation_totals
pair_ratio.xs("M", level="gender")
occupation
administrator 0.544304
artist 0.535714
doctor 1.000000
educator 0.726316
engineer 0.970149
entertainment 0.888889
executive 0.906250
healthcare 0.312500
homemaker 0.142857
lawyer 0.833333
librarian 0.431373
marketing 0.615385
none 0.555556
other 0.657143
programmer 0.909091
retired 0.928571
salesman 0.750000
scientist 0.903226
student 0.693878
technician 0.962963
writer 0.577778
dtype: float64
# Count how often each distinct combination of the remaining columns occurs
# within every (occupation, gender) group.
occupation_gender_groups = df.groupby(["occupation", "gender"])
occupation_gender_groups.value_counts()
occupation gender age zip_code male_n
administrator F 22 60202 0 1
23 20817 0 1
25 80538 0 1
27 19711 0 1
97214 0 1
..
writer M 50 27105 1 1
51 95468 1 1
52 08534 1 1
59801 1 1
60 94583 1 1
Name: count, Length: 936, dtype: int64
# Total number of entries in the occupation column.
len(df["occupation"])
943
문제4. occupation별 age의 최솟값, 최댓값 구하기
# Lowest and highest age recorded for each occupation.
age_by_occupation = df.groupby("occupation")["age"]
age_by_occupation.agg(["min", "max"])

문제5. 각 occupation의 gender별 평균 age를 구하기
# Mean age for every (occupation, gender) pair.
age_by_occupation_gender = df.groupby(["occupation", "gender"])["age"]
age_by_occupation_gender.mean()
occupation gender
administrator F 40.638889
M 37.162791
artist F 30.307692
M 32.333333
doctor M 43.571429
educator F 39.115385
M 43.101449
engineer F 29.500000
M 36.600000
entertainment F 31.000000
M 29.000000
executive F 44.000000
M 38.172414
healthcare F 39.818182
M 45.400000
homemaker F 34.166667
M 23.000000
lawyer F 39.500000
M 36.200000
librarian F 40.000000
M 40.000000
marketing F 37.200000
M 37.875000
none F 36.500000
M 18.600000
other F 35.472222
M 34.028986
programmer F 32.166667
M 33.216667
retired F 70.000000
M 62.538462
salesman F 27.000000
M 38.555556
scientist F 28.333333
M 36.321429
student F 20.750000
M 22.669118
technician F 38.000000
M 32.961538
writer F 37.631579
M 35.346154
Name: age, dtype: float64

'05_Pandas' 카테고리의 다른 글
| 글 제목 | 작성일 |
|---|---|
| 07_서울교통공사_에스컬레이터 설치 정보(1~9호선) (1) | 2025.03.27 |
| 06_서울교통공사_역사면적정보 (0) | 2025.03.27 |
| 05-4_연습문제_Vaccine (0) | 2025.03.07 |
| 05-3_연습문제_Fictional_Army (1) | 2025.03.06 |
| 05-2_연습문제_euro2012 (0) | 2025.03.06 |