dplyr_demo

R Laydies

Szupei Wang
2017/07/31

Outline

  • filter()
  • mutate()
  • arrange()
  • select()
  • summarise()

NBA_DATA

NBA = read.csv("C:/Users/szwang/Downloads/NBADATA.csv")
colnames(NBA)
 [1] "X"        "Players"  "MP"       "FG"       "FGA"      "FG_P"    
 [7] "X3P"      "X3PA"     "X3P_P"    "FT"       "FTA"      "FT_P"    
[13] "ORB"      "DRB"      "TRB"      "AST"      "STL"      "BLK"     
[19] "TOV"      "PF"       "PTS"      "dates"    "match"    "team"    
[25] "vs"       "season"   "position"

filter()

Filter rows

library(dplyr)
NBA_15_16 = NBA %>% 
  filter(season == '2015-2016')
NBA_14_16 = NBA %>% 
  filter(season == '2015-2016' | season == '2014-2015')
NBA_15_16_GW = NBA %>% 
  filter(season == '2015-2016' & team == 'Golden State Warriors')

mutate()

Add new columns

NBA_throw = NBA %>% 
  mutate(throw = FGA + X3PA)
head(mutate(NBA,throw = FGA + X3PA),1)
  X        Players       MP FG FGA  FG_P X3P X3PA X3P_P FT FTA  FT_P ORB
1 1 Michael Jordan 43:00:00 10  22 0.455   0    5     0 10  13 0.769   1
  DRB TRB AST STL BLK TOV PF PTS     dates match          team
1   3   4   3   0   0   1  2  30 1996/11/1  home Chicago Bulls
              vs    season position throw
1 Boston Celtics 1996-1997  Starter    27

arrange()

NBA_season = iris %>% 
  arrange(Sepal.Length)

head(arrange(iris,Sepal.Length))
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          4.3         3.0          1.1         0.1  setosa
2          4.4         2.9          1.4         0.2  setosa
3          4.4         3.0          1.3         0.2  setosa
4          4.4         3.2          1.3         0.2  setosa
5          4.5         2.3          1.3         0.3  setosa
6          4.6         3.1          1.5         0.2  setosa

select()

Select columns

NBA_select = NBA %>% 
  select(season,team,PTS)

head(select(NBA,season,team,PTS))
     season          team PTS
1 1996-1997 Chicago Bulls  30
2 1996-1997 Chicago Bulls  18
3 1996-1997 Chicago Bulls  13
4 1996-1997 Chicago Bulls   7
5 1996-1997 Chicago Bulls   4
6 1996-1997 Chicago Bulls  20

summarise()

Summarise values

NBA_summary = NBA %>% 
  summarise(mean(PTS),mean(TOV))

summarise(NBA,sum(PTS),mean(TOV))
  sum(PTS) mean(TOV)
1  4606223  1.378494

Combine 1

NBA_16_GW = NBA %>% 
  filter(season == '2015-2016' & team == 'Golden State Warriors') %>% 
  mutate(throw = FGA + X3PA) %>% 
  select(Players,team,match,vs,PTS)
head(NBA_16_GW)
          Players                  team match                   vs PTS
1   Stephen Curry Golden State Warriors  away New Orleans Pelicans  40
2 Harrison Barnes Golden State Warriors  away New Orleans Pelicans   8
3  Draymond Green Golden State Warriors  away New Orleans Pelicans  10
4   Klay Thompson Golden State Warriors  away New Orleans Pelicans   9
5    Andrew Bogut Golden State Warriors  away New Orleans Pelicans  12
6  Andre Iguodala Golden State Warriors  away New Orleans Pelicans   2

Combine 2

NBA %>% 
  filter(season == '2015-2016' & team == 'Golden State Warriors') %>% 
  mutate(throw = FGA + X3PA) %>% 
  select(Players,team,match,vs,PTS,throw) %>% 
  group_by(vs,match) %>% 
  summarise(value=sum(PTS))
# A tibble: 58 x 3
# Groups:   vs [?]
                  vs  match value
              <fctr> <fctr> <int>
 1     Atlanta Hawks   away   109
 2     Atlanta Hawks   home   102
 3    Boston Celtics   away   106
 4    Boston Celtics   home   124
 5     Brooklyn Nets   away   107
 6     Brooklyn Nets   home   114
 7 Charlotte Hornets   away   111
 8 Charlotte Hornets   home   116
 9     Chicago Bulls   away   106
10     Chicago Bulls   home   125
# ... with 48 more rows

Combine 3

NBA %>% 
  filter(season == '2015-2016') %>%
  group_by(team,vs,dates,match) %>% 
  select(team,vs,match,PTS) %>%
  summarise(sum(PTS)) %>% 
  distinct()
# A tibble: 2,460 x 5
# Groups:   team, vs, dates [2,460]
            team                vs      dates  match `sum(PTS)`
          <fctr>            <fctr>     <fctr> <fctr>      <int>
 1 Atlanta Hawks    Boston Celtics 2015/11/13   home         93
 2 Atlanta Hawks    Boston Celtics 2015/11/24   away        121
 3 Atlanta Hawks    Boston Celtics 2015/12/18   home        109
 4 Atlanta Hawks    Boston Celtics   2016/4/9   away        118
 5 Atlanta Hawks     Brooklyn Nets 2015/11/17   home         88
 6 Atlanta Hawks     Brooklyn Nets  2015/11/4   away        101
 7 Atlanta Hawks     Brooklyn Nets  2016/1/16   away        114
 8 Atlanta Hawks Charlotte Hornets 2015/10/30   away         97
 9 Atlanta Hawks Charlotte Hornets  2015/11/1   home         94
10 Atlanta Hawks Charlotte Hornets  2016/1/13   home         84
# ... with 2,450 more rows

dplyr + ggplot2

library(ggplot2)
NBA %>% 
  filter(season == '2015-2016') %>%
  group_by(team) %>% 
  summarise(value=sum(PTS)) %>%
  ggplot(aes(x = team ,y = value)) + 
           geom_bar(stat = "identity")

plot of chunk unnamed-chunk-10

join

  • inner_join
  • left_join
  • right_join
  • full_join

join

inner_join

A = data.frame(name=c('A','B','C'),value1=c(1,2,3))
B = data.frame(name=c('A','D','B'),value2=c(9,8,7))
A %>% inner_join(B,by='name')
  name value1 value2
1    A      1      9
2    B      2      7

join

left_join

A = data.frame(name=c('A','B','C'),value1=c(1,2,3))
B = data.frame(name=c('A','D','B'),value2=c(9,8,7))
A %>% left_join(B,by='name')
  name value1 value2
1    A      1      9
2    B      2      7
3    C      3     NA

join

right_join

A = data.frame(name=c('A','B','C'),value1=c(1,2,3))
B = data.frame(name=c('A','D','B'),value2=c(9,8,7))
A %>% right_join(B,by='name')
  name value1 value2
1    A      1      9
2    D     NA      8
3    B      2      7

join

full_join

A = data.frame(name=c('A','B','C'),value1=c(1,2,3))
B = data.frame(name=c('A','D','B'),value2=c(9,8,7))
A %>% full_join(B,by='name')
  name value1 value2
1    A      1      9
2    B      2      7
3    C      3     NA
4    D     NA      8