본문 바로가기
빅데이터분석기사

빅분기 ) 빅분기 3회 모의고사 기출 풀어보기 작업형1유형 with R

by C.Mond 2023. 10. 18.
728x90
728x90

spotify.csv
0.11MB

 

# The data holds 100 songs per year, already sorted by popularity.
# Task: add a `rank` column (1-100 within each year) and compute the mean
# `bpm` of each year's number-one song.
library(dplyr)
library(data.table)

# NOTE(review): `df` is assumed to have been loaded (presumably from
# spotify.csv) before this point; the loading step is not shown here.
df
length(unique(df$year.released))
unique(df$top.year)

# Inspect the rows whose chart year is missing.
df[is.na(df$top.year), ]

# Number of missing values per column.
colSums(is.na(df))
df <- na.omit(df)

# row_number() numbers the rows of each year in their existing (popularity)
# order. Unlike a literal 1:100 it does not fail when a year ends up with a
# row count other than 100, e.g. after na.omit() dropped incomplete rows.
df <- df %>%
  group_by(top.year) %>%
  mutate(rank = row_number()) %>%
  ungroup() %>%
  as.data.table()

# Keep only the top-ranked song of every year.
df_first <- df %>%
  filter(rank == 1)

mean(df_first$bpm)


125.6


# Which artist placed the most songs in the 2015 top 100?

df_2015 <- filter(df, top.year == 2015)

# Song count per artist, sorted ascending: the last entry is the most frequent.
artist_counts <- sort(table(df_2015$artist))
names(artist_counts[length(artist_counts)])


# Among the songs ranked 1-10 in each year, which `top genre` is the second
# most common?

df_1_10 <- filter(df, rank >= 1 & rank <= 10)

# Genre frequencies, most common first; the 2nd and 3rd entries are shown.
genre_counts <- sort(table(df_1_10$top.genre), decreasing = TRUE)
names(genre_counts[2])
names(genre_counts[3])

> names(sort(table(df_1_10$top.genre),decreasing = T)[2])
[1] "atl hip hop"
> names(sort(table(df_1_10$top.genre),decreasing = T)[3])
[1] "british soul"

 


# Featured artists are written into the title as "(feat. NAME)".
# Which artist has been featured most often?
library(stringr)

# The dot is escaped so that only the literal text "(feat." matches. With an
# unescaped dot, "feat." also matches inside ordinary words (for example
# "Defeated"), which could move the extraction start to the wrong place.
feat_list <- df$title[str_detect(df$title, pattern = "\\(feat\\.")]

# Capture everything between "(feat." and the next ")" (or the end of the
# title when the parenthesis is never closed), skipping the space after the dot.
feat_list_r2 <- str_match(feat_list, "\\(feat\\.\\s*([^)]*)")[, 2]

# NOTE(review): titles that credit several featured artists in a single
# parenthesis are counted as one combined name; they are not split here.
sort(table(feat_list_r2), decreasing = TRUE)

                                         Bruno Mars 

 

 

 


# Per `top.year`, count the songs whose release year (`year.released`) differs
# from the year they entered the top 100 (`top.year`). Which year has the most?

# Flag = 1 when the two years DIFFER. The comparison must be `!=`; using `==`
# would count the songs whose years match, i.e. the opposite of the question.
# NOTE(review): the answer recorded in the write-up was obtained with the
# `==` comparison and should be re-checked against this corrected flag.
df$rel_top_differ <- as.integer(df$year.released != df$top.year)

df_cnt <- df %>%
  group_by(top.year) %>%
  summarise(cnt = sum(rel_top_differ)) %>%
  arrange(desc(cnt))

# Year with the largest number of mismatching songs.
df_cnt$top.year[1]

2018

# How many distinct artists have the letter "q" in their name, ignoring case?

# Lower-casing before unique() folds case variants of a name into one entry.
artist_names <- unique(tolower(df$artist))
sum(str_detect(artist_names, "q"))



6

# Across all years, difference between the mean `dur` of ranks 1-50 and the
# mean `dur` of ranks 51-100.

# Upper half of the chart.
df_50 <- filter(df, rank >= 1 & rank <= 50)

# Lower half of the chart.
df_100 <- filter(df, rank >= 51 & rank <= 100)

mean(df_50$dur) - mean(df_100$dur)


0.896

# Split every title on spaces: which word occurs most often (case-insensitive)?

# Flatten the per-title token lists. Splitting with simplify = TRUE would pad
# the shorter titles with "" to build a rectangular matrix, and that padding
# would then be counted as the most frequent "word".
title_tokens <- unlist(str_split(tolower(df$title), " "))

# Keep only tokens made of letters, digits or apostrophes, so empty strings
# and punctuation-bearing tokens such as "-" or "(feat." are not ranked.
# NOTE(review): adjust this filter if such tokens should count as words.
title_words <- title_tokens[which(str_detect(title_tokens, "^[[:alnum:]']+$"))]

# Most frequent word first; no hard-coded position is needed any more.
word_counts <- sort(table(title_words), decreasing = TRUE)
word_counts[1]

the


# Compute the mean `nrgy` per year, then the difference between the largest
# and the smallest yearly mean.
df_avg <- df %>%
  group_by(top.year) %>%
  summarise(avg = mean(nrgy)) %>%
  arrange(desc(avg))

# Take the extremes explicitly. arrange() without arguments does not sort, so
# picking rows 1 and 10 would only compare the first and the tenth year and
# would silently depend on there being exactly ten years.
df_avg_list <- c(max(df_avg$avg), min(df_avg$avg))
df_avg_list[1] - df_avg_list[2]

13.86


# Which artist is listed with more than one `artist type`?

df_new <- select(df, artist, artist.type)

# Number of distinct artist types per artist, largest first.
df_new_group <- df_new %>%
  group_by(artist) %>%
  summarise(cnt = n_distinct(artist.type)) %>%
  arrange(desc(cnt))

# Artist with the highest number of distinct types.
df_new_group$artist[1]

 

 

 

 

Rudimental

728x90
728x90

댓글