Filtering players with greater than 500 at bats
bdata <- Batting %>%
filter(AB > 500)%>%
select(playerID, HR, yearID) %>%
arrange(desc(HR))
## Warning: package 'bindrcpp' was built under R version 3.5.2
head(bdata)
## playerID HR yearID
## 1 mcgwima01 70 1998
## 2 sosasa01 66 1998
## 3 mcgwima01 65 1999
## 4 sosasa01 64 2001
## 5 sosasa01 63 1999
## 6 marisro01 61 1961
Using & (and) and | (or) to filter with greater detail
bdata2 <- Batting %>%
filter(AB > 500 & (HR >= 50 | SO < 25))%>%
select(playerID, HR, yearID) %>%
arrange(desc(HR))
head(bdata2)
## playerID HR yearID
## 1 mcgwima01 70 1998
## 2 sosasa01 66 1998
## 3 mcgwima01 65 1999
## 4 sosasa01 64 2001
## 5 sosasa01 63 1999
## 6 marisro01 61 1961
Summing HR data by player
bdata3 <- Batting %>%
group_by(playerID) %>%
summarize(career_HR = sum(HR, na.rm = TRUE)) %>%
filter(career_HR >= 600) %>%
arrange(desc(career_HR))
head(bdata3)
## # A tibble: 6 x 2
## playerID career_HR
## <chr> <int>
## 1 bondsba01 762
## 2 aaronha01 755
## 3 ruthba01 714
## 4 rodrial01 696
## 5 mayswi01 660
## 6 griffke02 630

Averaging season hit total by player
bdata4 <- Batting %>%
group_by(playerID) %>%
summarize(avg_season_H = round(mean(H, na.rm = TRUE), 2)) %>%
arrange(desc(avg_season_H))
head(bdata4)
## # A tibble: 6 x 2
## playerID avg_season_H
## <chr> <dbl>
## 1 puckeki01 192
## 2 canoro01 184.
## 3 cabremi01 180.
## 4 abreujo02 179
## 5 suzukic01 178.
## 6 burkeje01 178.
Identifying minimum and maximum HR by player
bdata5 <- Batting %>%
group_by(playerID) %>%
summarize(Max_HR = max(HR), Min_HR = min(HR)) %>%
arrange(desc(Max_HR))
head(bdata5)
## # A tibble: 6 x 3
## playerID Max_HR Min_HR
## <chr> <dbl> <dbl>
## 1 bondsba01 73 5
## 2 mcgwima01 70 3
## 3 sosasa01 66 1
## 4 marisro01 61 5
## 5 ruthba01 60 0
## 6 foxxji01 58 0
Count number of unique records for each player
bdata6 <- Batting %>%
group_by(playerID) %>%
summarize(Unique_records = n())
head(bdata6)
## # A tibble: 6 x 2
## playerID Unique_records
## <chr> <int>
## 1 aardsda01 9
## 2 aaronha01 23
## 3 aaronto01 7
## 4 aasedo01 13
## 5 abadan01 3
## 6 abadfe01 8
Strikeouts by player
bdata7 <- Batting %>%
filter(AB >= 400) %>%
group_by(playerID) %>%
summarize(Min_SO = min(SO, na.rm = TRUE)) %>%
filter(Min_SO < 20) %>%
arrange(Min_SO)
head(bdata7)
## # A tibble: 6 x 2
## playerID Min_SO
## <chr> <dbl>
## 1 doyleja01 3
## 2 seweljo01 3
## 3 holloch01 5
## 4 mcinnst01 5
## 5 wanerll01 5
## 6 wardjo01 5
Batting AVG by player (best year)
bdata8 <- Batting %>%
filter(AB >= 400) %>%
group_by(playerID) %>%
mutate(Batting_AVG = round(H/AB, 3)) %>%
select(playerID, Batting_AVG, yearID) %>%
arrange(desc(Batting_AVG))
head(bdata8)
## # A tibble: 6 x 3
## # Groups: playerID [6]
## playerID Batting_AVG yearID
## <chr> <dbl> <int>
## 1 duffyhu01 0.44 1894
## 2 oneilti01 0.435 1887
## 3 lajoina01 0.426 1901
## 4 keelewi01 0.424 1897
## 5 hornsro01 0.424 1924
## 6 cobbty01 0.42 1911
Batting AVG by player (career)
bdata9 <- Batting %>%
group_by(playerID) %>%
summarize(Career_H = sum(H, na.rm = TRUE), Career_AB = sum(AB, na.rm = TRUE)) %>%
filter(Career_AB >= 1000) %>%
mutate(Career_AVG = round(Career_H/Career_AB, 3)) %>%
select(playerID, Career_AVG) %>%
arrange(desc(Career_AVG))
head(bdata9)
## # A tibble: 6 x 2
## playerID Career_AVG
## <chr> <dbl>
## 1 cobbty01 0.366
## 2 barnero01 0.359
## 3 hornsro01 0.358
## 4 jacksjo01 0.356
## 5 meyerle01 0.356
## 6 odoulle01 0.349
Alternative way to join the data
data <- Batting %>%
filter(playerID == "ruthba01" | playerID == "aaronha01")
Join2 <- inner_join(data, Master, by = c("playerID")) %>%
arrange(playerID)
head(Join2)
## playerID yearID stint teamID lgID G AB R H X2B X3B HR RBI SB CS
## 1 aaronha01 1954 1 ML1 NL 122 468 58 131 27 6 13 69 2 2
## 2 aaronha01 1955 1 ML1 NL 153 602 105 189 37 9 27 106 3 1
## 3 aaronha01 1956 1 ML1 NL 153 609 106 200 34 14 26 92 2 4
## 4 aaronha01 1957 1 ML1 NL 151 615 118 198 27 6 44 132 1 1
## 5 aaronha01 1958 1 ML1 NL 153 601 109 196 34 4 30 95 4 1
## 6 aaronha01 1959 1 ML1 NL 154 629 116 223 46 7 39 123 8 0
## BB SO IBB HBP SH SF GIDP birthYear birthMonth birthDay birthCountry
## 1 28 39 NA 3 6 4 13 1934 2 5 USA
## 2 49 61 5 3 7 4 20 1934 2 5 USA
## 3 37 54 6 2 5 7 21 1934 2 5 USA
## 4 57 58 15 0 0 3 13 1934 2 5 USA
## 5 59 49 16 1 0 3 21 1934 2 5 USA
## 6 51 54 17 4 0 9 19 1934 2 5 USA
## birthState birthCity deathYear deathMonth deathDay deathCountry
## 1 AL Mobile NA NA NA <NA>
## 2 AL Mobile NA NA NA <NA>
## 3 AL Mobile NA NA NA <NA>
## 4 AL Mobile NA NA NA <NA>
## 5 AL Mobile NA NA NA <NA>
## 6 AL Mobile NA NA NA <NA>
## deathState deathCity nameFirst nameLast nameGiven weight height bats
## 1 <NA> <NA> Hank Aaron Henry Louis 180 72 R
## 2 <NA> <NA> Hank Aaron Henry Louis 180 72 R
## 3 <NA> <NA> Hank Aaron Henry Louis 180 72 R
## 4 <NA> <NA> Hank Aaron Henry Louis 180 72 R
## 5 <NA> <NA> Hank Aaron Henry Louis 180 72 R
## 6 <NA> <NA> Hank Aaron Henry Louis 180 72 R
## throws debut finalGame retroID bbrefID deathDate birthDate
## 1 R 1954-04-13 1976-10-03 aaroh101 aaronha01 <NA> 1934-02-05
## 2 R 1954-04-13 1976-10-03 aaroh101 aaronha01 <NA> 1934-02-05
## 3 R 1954-04-13 1976-10-03 aaroh101 aaronha01 <NA> 1934-02-05
## 4 R 1954-04-13 1976-10-03 aaroh101 aaronha01 <NA> 1934-02-05
## 5 R 1954-04-13 1976-10-03 aaroh101 aaronha01 <NA> 1934-02-05
## 6 R 1954-04-13 1976-10-03 aaroh101 aaronha01 <NA> 1934-02-05
Create a new database with player names, team, year, and HR then sort by most HR
bdat <- Batting %>%
select(playerID, teamID, yearID, HR)
Join3 <- inner_join(bdat, Master, by = c("playerID")) %>%
select(nameFirst, nameLast, teamID, yearID, HR) %>%
arrange(desc(HR))
head(Join3)
## nameFirst nameLast teamID yearID HR
## 1 Barry Bonds SFN 2001 73
## 2 Mark McGwire SLN 1998 70
## 3 Sammy Sosa CHN 1998 66
## 4 Mark McGwire SLN 1999 65
## 5 Sammy Sosa CHN 2001 64
## 6 Sammy Sosa CHN 1999 63
Joining batting data with team data (Babe Ruth)
bdat <- Batting %>%
filter(playerID == "ruthba01") %>%
select(playerID, teamID, yearID, HR)
Join4 <- inner_join(bdat, Teams, by = c("teamID", "yearID")) %>%
select(playerID, name, yearID, HR = HR.x)
Join4
## playerID name yearID HR
## 1 ruthba01 Boston Red Sox 1914 0
## 2 ruthba01 Boston Red Sox 1915 4
## 3 ruthba01 Boston Red Sox 1916 3
## 4 ruthba01 Boston Red Sox 1917 2
## 5 ruthba01 Boston Red Sox 1918 11
## 6 ruthba01 Boston Red Sox 1919 29
## 7 ruthba01 New York Yankees 1920 54
## 8 ruthba01 New York Yankees 1921 59
## 9 ruthba01 New York Yankees 1922 35
## 10 ruthba01 New York Yankees 1923 41
## 11 ruthba01 New York Yankees 1924 46
## 12 ruthba01 New York Yankees 1925 25
## 13 ruthba01 New York Yankees 1926 47
## 14 ruthba01 New York Yankees 1927 60
## 15 ruthba01 New York Yankees 1928 54
## 16 ruthba01 New York Yankees 1929 46
## 17 ruthba01 New York Yankees 1930 49
## 18 ruthba01 New York Yankees 1931 46
## 19 ruthba01 New York Yankees 1932 41
## 20 ruthba01 New York Yankees 1933 34
## 21 ruthba01 New York Yankees 1934 22
## 22 ruthba01 Boston Braves 1935 6
Joining batting data with Master and team data (Babe Ruth)
bdat <- Batting %>%
filter(playerID == "ruthba01") %>%
select(playerID, teamID, yearID, HR)
Join4 <- inner_join(bdat, Master, by = c("playerID")) %>%
select(nameFirst, nameLast, teamID, yearID, HR)
Join4.1 <- inner_join(Join4, Teams, by = c("teamID", "yearID")) %>%
select(nameFirst, nameLast, name, yearID, HR = HR.x)
Join4.1
## nameFirst nameLast name yearID HR
## 1 Babe Ruth Boston Red Sox 1914 0
## 2 Babe Ruth Boston Red Sox 1915 4
## 3 Babe Ruth Boston Red Sox 1916 3
## 4 Babe Ruth Boston Red Sox 1917 2
## 5 Babe Ruth Boston Red Sox 1918 11
## 6 Babe Ruth Boston Red Sox 1919 29
## 7 Babe Ruth New York Yankees 1920 54
## 8 Babe Ruth New York Yankees 1921 59
## 9 Babe Ruth New York Yankees 1922 35
## 10 Babe Ruth New York Yankees 1923 41
## 11 Babe Ruth New York Yankees 1924 46
## 12 Babe Ruth New York Yankees 1925 25
## 13 Babe Ruth New York Yankees 1926 47
## 14 Babe Ruth New York Yankees 1927 60
## 15 Babe Ruth New York Yankees 1928 54
## 16 Babe Ruth New York Yankees 1929 46
## 17 Babe Ruth New York Yankees 1930 49
## 18 Babe Ruth New York Yankees 1931 46
## 19 Babe Ruth New York Yankees 1932 41
## 20 Babe Ruth New York Yankees 1933 34
## 21 Babe Ruth New York Yankees 1934 22
## 22 Babe Ruth Boston Braves 1935 6
Summing HR by player while joining Batting with Master
bdat <- Batting %>%
group_by(playerID) %>%
summarize(career_HR = sum(HR, na.rm = TRUE))
Join5 <- inner_join(bdat, Master, by = c("playerID")) %>%
select(nameFirst, nameLast, career_HR) %>%
arrange(desc(career_HR))
head(Join5)
## # A tibble: 6 x 3
## nameFirst nameLast career_HR
## <chr> <chr> <int>
## 1 Barry Bonds 762
## 2 Hank Aaron 755
## 3 Babe Ruth 714
## 4 Alex Rodriguez 696
## 5 Willie Mays 660
## 6 Ken Griffey 630
Alternative method for Join5
Join5.1 <- inner_join(Batting, Master, by = c("playerID")) %>%
group_by(playerID) %>%
summarize(First_name = nameFirst[1], Last_name = nameLast[1], career_HR = sum(HR, na.rm = TRUE)) %>%
select(First_name, Last_name, career_HR) %>%
arrange(desc(career_HR))
head(Join5.1)
## # A tibble: 6 x 3
## First_name Last_name career_HR
## <chr> <chr> <int>
## 1 Barry Bonds 762
## 2 Hank Aaron 755
## 3 Babe Ruth 714
## 4 Alex Rodriguez 696
## 5 Willie Mays 660
## 6 Ken Griffey 630
Project1 <-
Generate a list of all the players who played for a team whose park was Petco Park.
Each such player should be listed with his first and last name.
No player should be listed more than once.
Limit the list to players in the Batting table.
tdat <- Teams %>%
filter(park == "Petco Park") %>%
select(teamID, yearID)
We realize the dates for Petco are 2004 thru 2016 so we can filter directly from the Batting data
bdat <- Batting %>%
filter(teamID == "SDN", yearID >= 2004, yearID <= 2016) %>%
select(playerID)
Project1 <- inner_join(bdat, Master, by = c("playerID")) %>%
group_by(playerID) %>%
summarize(First_name = nameFirst[1], Last_name = nameLast[1]) %>%
select(First_name, Last_name)
summary(Project1)
## First_name Last_name
## Length:369 Length:369
## Class :character Class :character
## Mode :character Mode :character
Project2 <-
List all players named “Bob” who averaged more than $1 Million per year in salary for their career.
Include last names.
bdat <- inner_join(Batting, Master, by = c("playerID")) %>%
group_by(playerID) %>%
summarize(First_name = nameFirst[1], Last_name = nameLast[1]) %>%
select(playerID, First_name, Last_name)
salary <- Salaries %>%
group_by(playerID) %>%
summarize(AVG_Salary = mean(salary)) %>%
filter(AVG_Salary > 1000000) %>%
select(playerID, AVG_Salary)
Project2 <- inner_join(salary, bdat, by = c("playerID")) %>%
select(First_name, Last_name, AVG_Salary) %>%
filter(First_name == "Bob") %>%
arrange(desc(AVG_Salary))
Project2
## # A tibble: 8 x 3
## First_name Last_name AVG_Salary
## <chr> <chr> <dbl>
## 1 Bob Wickman 2635812.
## 2 Bob Welch 1930417.
## 3 Bob Howry 1804583.
## 4 Bob Horner 1416667.
## 5 Bob Tewksbury 1288182.
## 6 Bob Stanley 1067500
## 7 Bob Boone 1047580
## 8 Bob Ojeda 1035926.

Much of the learnings were acquired through Udemy’s “Baseball Database Queries with SQL and dplyr” taught by Charles Redmond.
link