Filtering players with greater than 500 at bats

bdata <- Batting %>%
           filter(AB > 500)%>%
           select(playerID, HR, yearID) %>%
           arrange(desc(HR))
## Warning: package 'bindrcpp' was built under R version 3.5.2
head(bdata)
##    playerID HR yearID
## 1 mcgwima01 70   1998
## 2  sosasa01 66   1998
## 3 mcgwima01 65   1999
## 4  sosasa01 64   2001
## 5  sosasa01 63   1999
## 6 marisro01 61   1961

Using & (and) and | (or) to filter with greater detail

bdata2 <- Batting %>%
            filter(AB > 500 & (HR >= 50 | SO < 25))%>%
            select(playerID, HR, yearID) %>%
            arrange(desc(HR))
head(bdata2)
##    playerID HR yearID
## 1 mcgwima01 70   1998
## 2  sosasa01 66   1998
## 3 mcgwima01 65   1999
## 4  sosasa01 64   2001
## 5  sosasa01 63   1999
## 6 marisro01 61   1961

Summing HR data by player

bdata3 <- Batting %>%
            group_by(playerID) %>%
            summarize(career_HR = sum(HR, na.rm = TRUE)) %>%
            filter(career_HR >= 600) %>%
            arrange(desc(career_HR))
head(bdata3)
## # A tibble: 6 x 2
##   playerID  career_HR
##   <chr>         <int>
## 1 bondsba01       762
## 2 aaronha01       755
## 3 ruthba01        714
## 4 rodrial01       696
## 5 mayswi01        660
## 6 griffke02       630

Averaging season hit total by player

bdata4 <- Batting %>%
            group_by(playerID) %>%
            summarize(avg_season_H = round(mean(H, na.rm = TRUE), 2)) %>%
            arrange(desc(avg_season_H))
head(bdata4)
## # A tibble: 6 x 2
##   playerID  avg_season_H
##   <chr>            <dbl>
## 1 puckeki01         192 
## 2 canoro01          184.
## 3 cabremi01         180.
## 4 abreujo02         179 
## 5 suzukic01         178.
## 6 burkeje01         178.

Identifying minimum and maximum HR by player

bdata5 <- Batting %>%
            group_by(playerID) %>%
            summarize(Max_HR = max(HR), Min_HR = min(HR)) %>%
            arrange(desc(Max_HR))
head(bdata5)
## # A tibble: 6 x 3
##   playerID  Max_HR Min_HR
##   <chr>      <dbl>  <dbl>
## 1 bondsba01     73      5
## 2 mcgwima01     70      3
## 3 sosasa01      66      1
## 4 marisro01     61      5
## 5 ruthba01      60      0
## 6 foxxji01      58      0

Count number of unique records for each player

bdata6 <- Batting %>%
            group_by(playerID) %>%
            summarize(Unique_records = n())
head(bdata6)
## # A tibble: 6 x 2
##   playerID  Unique_records
##   <chr>              <int>
## 1 aardsda01              9
## 2 aaronha01             23
## 3 aaronto01              7
## 4 aasedo01              13
## 5 abadan01               3
## 6 abadfe01               8

Strikeouts by player

bdata7 <- Batting %>%
            filter(AB >= 400) %>%
            group_by(playerID) %>%
            summarize(Min_SO = min(SO, na.rm = TRUE)) %>%
            filter(Min_SO < 20) %>%
            arrange(Min_SO)
head(bdata7)
## # A tibble: 6 x 2
##   playerID  Min_SO
##   <chr>      <dbl>
## 1 doyleja01      3
## 2 seweljo01      3
## 3 holloch01      5
## 4 mcinnst01      5
## 5 wanerll01      5
## 6 wardjo01       5

Batting AVG by player (best year)

bdata8 <- Batting %>%
            filter(AB >= 400) %>%
            group_by(playerID) %>%
            mutate(Batting_AVG = round(H/AB, 3)) %>%
            select(playerID, Batting_AVG, yearID) %>%
            arrange(desc(Batting_AVG))
head(bdata8)
## # A tibble: 6 x 3
## # Groups:   playerID [6]
##   playerID  Batting_AVG yearID
##   <chr>           <dbl>  <int>
## 1 duffyhu01       0.44    1894
## 2 oneilti01       0.435   1887
## 3 lajoina01       0.426   1901
## 4 keelewi01       0.424   1897
## 5 hornsro01       0.424   1924
## 6 cobbty01        0.42    1911

Batting AVG by player (career)

bdata9 <- Batting %>%
            group_by(playerID) %>%
            summarize(Career_H = sum(H, na.rm = TRUE), Career_AB = sum(AB, na.rm = TRUE)) %>%
            filter(Career_AB >= 1000) %>%
            mutate(Career_AVG = round(Career_H/Career_AB, 3)) %>%
            select(playerID, Career_AVG) %>%
            arrange(desc(Career_AVG))
head(bdata9)
## # A tibble: 6 x 2
##   playerID  Career_AVG
##   <chr>          <dbl>
## 1 cobbty01       0.366
## 2 barnero01      0.359
## 3 hornsro01      0.358
## 4 jacksjo01      0.356
## 5 meyerle01      0.356
## 6 odoulle01      0.349

Joining the biological information to the batting data

Join <- inner_join (Batting, Master, by = c("playerID")) %>%
          filter(playerID == "ruthba01" | playerID == "aaronha01") %>%
          arrange(playerID)
head(Join)
##    playerID yearID stint teamID lgID   G  AB   R   H X2B X3B HR RBI SB CS
## 1 aaronha01   1954     1    ML1   NL 122 468  58 131  27   6 13  69  2  2
## 2 aaronha01   1955     1    ML1   NL 153 602 105 189  37   9 27 106  3  1
## 3 aaronha01   1956     1    ML1   NL 153 609 106 200  34  14 26  92  2  4
## 4 aaronha01   1957     1    ML1   NL 151 615 118 198  27   6 44 132  1  1
## 5 aaronha01   1958     1    ML1   NL 153 601 109 196  34   4 30  95  4  1
## 6 aaronha01   1959     1    ML1   NL 154 629 116 223  46   7 39 123  8  0
##   BB SO IBB HBP SH SF GIDP birthYear birthMonth birthDay birthCountry
## 1 28 39  NA   3  6  4   13      1934          2        5          USA
## 2 49 61   5   3  7  4   20      1934          2        5          USA
## 3 37 54   6   2  5  7   21      1934          2        5          USA
## 4 57 58  15   0  0  3   13      1934          2        5          USA
## 5 59 49  16   1  0  3   21      1934          2        5          USA
## 6 51 54  17   4  0  9   19      1934          2        5          USA
##   birthState birthCity deathYear deathMonth deathDay deathCountry
## 1         AL    Mobile        NA         NA       NA         <NA>
## 2         AL    Mobile        NA         NA       NA         <NA>
## 3         AL    Mobile        NA         NA       NA         <NA>
## 4         AL    Mobile        NA         NA       NA         <NA>
## 5         AL    Mobile        NA         NA       NA         <NA>
## 6         AL    Mobile        NA         NA       NA         <NA>
##   deathState deathCity nameFirst nameLast   nameGiven weight height bats
## 1       <NA>      <NA>      Hank    Aaron Henry Louis    180     72    R
## 2       <NA>      <NA>      Hank    Aaron Henry Louis    180     72    R
## 3       <NA>      <NA>      Hank    Aaron Henry Louis    180     72    R
## 4       <NA>      <NA>      Hank    Aaron Henry Louis    180     72    R
## 5       <NA>      <NA>      Hank    Aaron Henry Louis    180     72    R
## 6       <NA>      <NA>      Hank    Aaron Henry Louis    180     72    R
##   throws      debut  finalGame  retroID   bbrefID deathDate  birthDate
## 1      R 1954-04-13 1976-10-03 aaroh101 aaronha01      <NA> 1934-02-05
## 2      R 1954-04-13 1976-10-03 aaroh101 aaronha01      <NA> 1934-02-05
## 3      R 1954-04-13 1976-10-03 aaroh101 aaronha01      <NA> 1934-02-05
## 4      R 1954-04-13 1976-10-03 aaroh101 aaronha01      <NA> 1934-02-05
## 5      R 1954-04-13 1976-10-03 aaroh101 aaronha01      <NA> 1934-02-05
## 6      R 1954-04-13 1976-10-03 aaroh101 aaronha01      <NA> 1934-02-05

Alternative way to join the data

data <- Batting %>%
          filter(playerID == "ruthba01" | playerID == "aaronha01")

Join2 <- inner_join(data, Master, by = c("playerID")) %>%
          arrange(playerID)
head(Join2)
##    playerID yearID stint teamID lgID   G  AB   R   H X2B X3B HR RBI SB CS
## 1 aaronha01   1954     1    ML1   NL 122 468  58 131  27   6 13  69  2  2
## 2 aaronha01   1955     1    ML1   NL 153 602 105 189  37   9 27 106  3  1
## 3 aaronha01   1956     1    ML1   NL 153 609 106 200  34  14 26  92  2  4
## 4 aaronha01   1957     1    ML1   NL 151 615 118 198  27   6 44 132  1  1
## 5 aaronha01   1958     1    ML1   NL 153 601 109 196  34   4 30  95  4  1
## 6 aaronha01   1959     1    ML1   NL 154 629 116 223  46   7 39 123  8  0
##   BB SO IBB HBP SH SF GIDP birthYear birthMonth birthDay birthCountry
## 1 28 39  NA   3  6  4   13      1934          2        5          USA
## 2 49 61   5   3  7  4   20      1934          2        5          USA
## 3 37 54   6   2  5  7   21      1934          2        5          USA
## 4 57 58  15   0  0  3   13      1934          2        5          USA
## 5 59 49  16   1  0  3   21      1934          2        5          USA
## 6 51 54  17   4  0  9   19      1934          2        5          USA
##   birthState birthCity deathYear deathMonth deathDay deathCountry
## 1         AL    Mobile        NA         NA       NA         <NA>
## 2         AL    Mobile        NA         NA       NA         <NA>
## 3         AL    Mobile        NA         NA       NA         <NA>
## 4         AL    Mobile        NA         NA       NA         <NA>
## 5         AL    Mobile        NA         NA       NA         <NA>
## 6         AL    Mobile        NA         NA       NA         <NA>
##   deathState deathCity nameFirst nameLast   nameGiven weight height bats
## 1       <NA>      <NA>      Hank    Aaron Henry Louis    180     72    R
## 2       <NA>      <NA>      Hank    Aaron Henry Louis    180     72    R
## 3       <NA>      <NA>      Hank    Aaron Henry Louis    180     72    R
## 4       <NA>      <NA>      Hank    Aaron Henry Louis    180     72    R
## 5       <NA>      <NA>      Hank    Aaron Henry Louis    180     72    R
## 6       <NA>      <NA>      Hank    Aaron Henry Louis    180     72    R
##   throws      debut  finalGame  retroID   bbrefID deathDate  birthDate
## 1      R 1954-04-13 1976-10-03 aaroh101 aaronha01      <NA> 1934-02-05
## 2      R 1954-04-13 1976-10-03 aaroh101 aaronha01      <NA> 1934-02-05
## 3      R 1954-04-13 1976-10-03 aaroh101 aaronha01      <NA> 1934-02-05
## 4      R 1954-04-13 1976-10-03 aaroh101 aaronha01      <NA> 1934-02-05
## 5      R 1954-04-13 1976-10-03 aaroh101 aaronha01      <NA> 1934-02-05
## 6      R 1954-04-13 1976-10-03 aaroh101 aaronha01      <NA> 1934-02-05

Create a new database with player names, team, year, and HR then sort by most HR

bdat <- Batting %>%
          select(playerID, teamID, yearID, HR)

Join3 <- inner_join(bdat, Master, by = c("playerID")) %>%
          select(nameFirst, nameLast, teamID, yearID, HR) %>%
          arrange(desc(HR))
head(Join3)
##   nameFirst nameLast teamID yearID HR
## 1     Barry    Bonds    SFN   2001 73
## 2      Mark  McGwire    SLN   1998 70
## 3     Sammy     Sosa    CHN   1998 66
## 4      Mark  McGwire    SLN   1999 65
## 5     Sammy     Sosa    CHN   2001 64
## 6     Sammy     Sosa    CHN   1999 63

Joining batting data with team data (Babe Ruth)

bdat <- Batting %>%
          filter(playerID == "ruthba01") %>%
          select(playerID, teamID, yearID, HR)

Join4 <- inner_join(bdat, Teams, by = c("teamID", "yearID")) %>%
          select(playerID, name, yearID, HR = HR.x)
Join4
##    playerID             name yearID HR
## 1  ruthba01   Boston Red Sox   1914  0
## 2  ruthba01   Boston Red Sox   1915  4
## 3  ruthba01   Boston Red Sox   1916  3
## 4  ruthba01   Boston Red Sox   1917  2
## 5  ruthba01   Boston Red Sox   1918 11
## 6  ruthba01   Boston Red Sox   1919 29
## 7  ruthba01 New York Yankees   1920 54
## 8  ruthba01 New York Yankees   1921 59
## 9  ruthba01 New York Yankees   1922 35
## 10 ruthba01 New York Yankees   1923 41
## 11 ruthba01 New York Yankees   1924 46
## 12 ruthba01 New York Yankees   1925 25
## 13 ruthba01 New York Yankees   1926 47
## 14 ruthba01 New York Yankees   1927 60
## 15 ruthba01 New York Yankees   1928 54
## 16 ruthba01 New York Yankees   1929 46
## 17 ruthba01 New York Yankees   1930 49
## 18 ruthba01 New York Yankees   1931 46
## 19 ruthba01 New York Yankees   1932 41
## 20 ruthba01 New York Yankees   1933 34
## 21 ruthba01 New York Yankees   1934 22
## 22 ruthba01    Boston Braves   1935  6

Joining batting data with Master and team data (Babe Ruth)

bdat <- Batting %>%
          filter(playerID == "ruthba01") %>%
          select(playerID, teamID, yearID, HR)

Join4 <- inner_join(bdat, Master, by = c("playerID")) %>%
            select(nameFirst, nameLast, teamID, yearID, HR)

Join4.1 <- inner_join(Join4, Teams, by = c("teamID", "yearID")) %>%
              select(nameFirst, nameLast, name, yearID, HR = HR.x)
Join4.1
##    nameFirst nameLast             name yearID HR
## 1       Babe     Ruth   Boston Red Sox   1914  0
## 2       Babe     Ruth   Boston Red Sox   1915  4
## 3       Babe     Ruth   Boston Red Sox   1916  3
## 4       Babe     Ruth   Boston Red Sox   1917  2
## 5       Babe     Ruth   Boston Red Sox   1918 11
## 6       Babe     Ruth   Boston Red Sox   1919 29
## 7       Babe     Ruth New York Yankees   1920 54
## 8       Babe     Ruth New York Yankees   1921 59
## 9       Babe     Ruth New York Yankees   1922 35
## 10      Babe     Ruth New York Yankees   1923 41
## 11      Babe     Ruth New York Yankees   1924 46
## 12      Babe     Ruth New York Yankees   1925 25
## 13      Babe     Ruth New York Yankees   1926 47
## 14      Babe     Ruth New York Yankees   1927 60
## 15      Babe     Ruth New York Yankees   1928 54
## 16      Babe     Ruth New York Yankees   1929 46
## 17      Babe     Ruth New York Yankees   1930 49
## 18      Babe     Ruth New York Yankees   1931 46
## 19      Babe     Ruth New York Yankees   1932 41
## 20      Babe     Ruth New York Yankees   1933 34
## 21      Babe     Ruth New York Yankees   1934 22
## 22      Babe     Ruth    Boston Braves   1935  6

Summing HR by player while joining Batting with Master

bdat <- Batting %>%
          group_by(playerID) %>%
          summarize(career_HR = sum(HR, na.rm = TRUE))

Join5 <- inner_join(bdat, Master, by = c("playerID")) %>%
            select(nameFirst, nameLast, career_HR) %>%
            arrange(desc(career_HR))
head(Join5)
## # A tibble: 6 x 3
##   nameFirst nameLast  career_HR
##   <chr>     <chr>         <int>
## 1 Barry     Bonds           762
## 2 Hank      Aaron           755
## 3 Babe      Ruth            714
## 4 Alex      Rodriguez       696
## 5 Willie    Mays            660
## 6 Ken       Griffey         630

Alternative method for Join5

Join5.1 <- inner_join(Batting, Master, by = c("playerID")) %>%
              group_by(playerID) %>%
              summarize(First_name = nameFirst[1], Last_name = nameLast[1], career_HR = sum(HR, na.rm = TRUE)) %>%
              select(First_name, Last_name, career_HR) %>%
              arrange(desc(career_HR))
head(Join5.1)
## # A tibble: 6 x 3
##   First_name Last_name career_HR
##   <chr>      <chr>         <int>
## 1 Barry      Bonds           762
## 2 Hank       Aaron           755
## 3 Babe       Ruth            714
## 4 Alex       Rodriguez       696
## 5 Willie     Mays            660
## 6 Ken        Griffey         630

Project1 <-

Generate a list of all the players who played for a team whose park was Petco Park.

Each such player should be listed with his first and last name.

No player should be listed more than once.

Limit the list to players in the Batting table.

tdat <- Teams %>%
           filter(park == "Petco Park") %>%
           select(teamID, yearID)
We realize the dates for Petco are 2004 thru 2016 so we can filter directly from the Batting data
bdat <- Batting %>%
          filter(teamID == "SDN", yearID >= 2004, yearID <= 2016) %>%
          select(playerID)

Project1 <- inner_join(bdat, Master, by = c("playerID")) %>%
            group_by(playerID) %>%
            summarize(First_name = nameFirst[1], Last_name = nameLast[1]) %>%
            select(First_name, Last_name)
summary(Project1)            
##   First_name         Last_name        
##  Length:369         Length:369        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character

Project2 <-

List all players named “Bob” who averaged more than $1 Million per year in salary for their career.

Include last names.

bdat <- inner_join(Batting, Master, by = c("playerID")) %>%
              group_by(playerID) %>%
              summarize(First_name = nameFirst[1], Last_name = nameLast[1]) %>%
              select(playerID, First_name, Last_name)

salary <- Salaries %>%
            group_by(playerID) %>%
            summarize(AVG_Salary = mean(salary)) %>%
            filter(AVG_Salary > 1000000) %>%
            select(playerID, AVG_Salary)

Project2 <- inner_join(salary, bdat, by = c("playerID")) %>%
              select(First_name, Last_name, AVG_Salary) %>%
              filter(First_name == "Bob") %>%
              arrange(desc(AVG_Salary))
Project2
## # A tibble: 8 x 3
##   First_name Last_name AVG_Salary
##   <chr>      <chr>          <dbl>
## 1 Bob        Wickman     2635812.
## 2 Bob        Welch       1930417.
## 3 Bob        Howry       1804583.
## 4 Bob        Horner      1416667.
## 5 Bob        Tewksbury   1288182.
## 6 Bob        Stanley     1067500 
## 7 Bob        Boone       1047580 
## 8 Bob        Ojeda       1035926.

Much of the learnings were acquired through Udemy’s “Baseball Database Queries with SQL and dplyr” taught by Charles Redmond.

link