# Homework 10

Advanced ggplotting

Nicolás Zapata

# I started by loading the necessary packages 
library(tidyr)
library(ggbeeswarm)
## Loading required package: ggplot2
library(cowplot)
library(ggridges)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ─────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.2
## ── Conflicts ───────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ lubridate::stamp() masks cowplot::stamp()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
# I explored the available data. I will use the data from the Rolling Stone album rankings.

# Read the Rolling Stone album-ranking CSV from the TidyTuesday GitHub repository.
rolling_stone <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-05-07/rolling_stone.csv"
)
## Rows: 691 Columns: 21
## ── Column specification ───────────────────────────────────────────
## Delimiter: ","
## chr  (8): sort_name, clean_name, album, genre, type, spotify_url, artist_gen...
## dbl (13): rank_2003, rank_2012, rank_2020, differential, release_year, weeks...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the structure of the data: each column's type and its first values.
str(rolling_stone)
## spc_tbl_ [691 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ sort_name               : chr [1:691] "Sinatra, Frank" "Diddley, Bo" "Presley, Elvis" "Sinatra, Frank" ...
##  $ clean_name              : chr [1:691] "Frank Sinatra" "Bo Diddley" "Elvis Presley" "Frank Sinatra" ...
##  $ album                   : chr [1:691] "In the Wee Small Hours" "Bo Diddley / Go Bo Diddley" "Elvis Presley" "Songs for Swingin' Lovers!" ...
##  $ rank_2003               : num [1:691] 100 214 55 306 50 NA NA 421 NA 12 ...
##  $ rank_2012               : num [1:691] 101 216 56 308 50 NA 451 420 NA 12 ...
##  $ rank_2020               : num [1:691] 282 455 332 NA 227 32 33 NA 68 31 ...
##  $ differential            : num [1:691] -182 -241 -277 -195 -177 469 468 -80 433 -19 ...
##  $ release_year            : num [1:691] 1955 1955 1956 1956 1957 ...
##  $ genre                   : chr [1:691] "Big Band/Jazz" "Rock n' Roll/Rhythm & Blues" "Rock n' Roll/Rhythm & Blues" "Big Band/Jazz" ...
##  $ type                    : chr [1:691] "Studio" "Studio" "Studio" "Studio" ...
##  $ weeks_on_billboard      : num [1:691] 14 NA 100 NA 5 87 173 NA 27 NA ...
##  $ peak_billboard_position : num [1:691] 2 201 1 2 13 1 2 201 30 201 ...
##  $ spotify_popularity      : num [1:691] 48 50 58 62 64 73 67 47 75 52 ...
##  $ spotify_url             : chr [1:691] "spotify:album:3GmwKB1tgPZgXeRJZSm9WX" "spotify:album:1cbtDEwxCjMhglb49OgNBR" "spotify:album:7GXP5OhYyPVLmcVfO9Iqin" "spotify:album:4kca7vXd1Wo5GE2DMafvMc" ...
##  $ artist_member_count     : num [1:691] 1 1 1 1 1 1 1 4 1 1 ...
##  $ artist_gender           : chr [1:691] "Male" "Male" "Male" "Male" ...
##  $ artist_birth_year_sum   : num [1:691] 1915 1928 1935 1915 1932 ...
##  $ debut_album_release_year: num [1:691] 1946 1955 1956 1946 1957 ...
##  $ ave_age_at_top_500      : num [1:691] 40 27 21 41 25 35 23 19 27 33 ...
##  $ years_between           : num [1:691] 9 0 0 10 0 13 3 0 7 8 ...
##  $ album_id                : chr [1:691] "3GmwKB1tgPZgXeRJZSm9WX" "1cbtDEwxCjMhglb49OgNBR" "7GXP5OhYyPVLmcVfO9Iqin" "4kca7vXd1Wo5GE2DMafvMc" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   sort_name = col_character(),
##   ..   clean_name = col_character(),
##   ..   album = col_character(),
##   ..   rank_2003 = col_double(),
##   ..   rank_2012 = col_double(),
##   ..   rank_2020 = col_double(),
##   ..   differential = col_double(),
##   ..   release_year = col_double(),
##   ..   genre = col_character(),
##   ..   type = col_character(),
##   ..   weeks_on_billboard = col_double(),
##   ..   peak_billboard_position = col_double(),
##   ..   spotify_popularity = col_double(),
##   ..   spotify_url = col_character(),
##   ..   artist_member_count = col_double(),
##   ..   artist_gender = col_character(),
##   ..   artist_birth_year_sum = col_double(),
##   ..   debut_album_release_year = col_double(),
##   ..   ave_age_at_top_500 = col_double(),
##   ..   years_between = col_double(),
##   ..   album_id = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Preview the first rows of the table.
head(rolling_stone)
## # A tibble: 6 × 21
##   sort_name      clean_name     album rank_2003 rank_2012 rank_2020 differential
##   <chr>          <chr>          <chr>     <dbl>     <dbl>     <dbl>        <dbl>
## 1 Sinatra, Frank Frank Sinatra  In t…       100       101       282         -182
## 2 Diddley, Bo    Bo Diddley     Bo D…       214       216       455         -241
## 3 Presley, Elvis Elvis Presley  Elvi…        55        56       332         -277
## 4 Sinatra, Frank Frank Sinatra  Song…       306       308        NA         -195
## 5 Little Richard Little Richard Here…        50        50       227         -177
## 6 Beyonce        Beyonce        Lemo…        NA        NA        32          469
## # ℹ 14 more variables: release_year <dbl>, genre <chr>, type <chr>,
## #   weeks_on_billboard <dbl>, peak_billboard_position <dbl>,
## #   spotify_popularity <dbl>, spotify_url <chr>, artist_member_count <dbl>,
## #   artist_gender <chr>, artist_birth_year_sum <dbl>,
## #   debut_album_release_year <dbl>, ave_age_at_top_500 <dbl>,
## #   years_between <dbl>, album_id <chr>
# Summarise every column: missing values, completeness and basic distribution statistics.
skimr::skim(rolling_stone) 
Data summary
Name rolling_stone
Number of rows 691
Number of columns 21
_______________________
Column type frequency:
character 8
numeric 13
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
sort_name 0 1.00 1 34 0 391 0
clean_name 0 1.00 1 34 0 386 0
album 0 1.00 1 69 0 685 0
genre 164 0.76 5 35 0 16 0
type 0 1.00 4 13 0 5 0
spotify_url 36 0.95 22 36 0 655 0
artist_gender 5 0.99 4 11 0 3 0
album_id 0 1.00 6 22 0 691 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
rank_2003 191 0.72 250.50 144.49 1 125.75 250.5 375.25 500 ▇▇▇▇▇
rank_2012 191 0.72 250.50 144.48 1 125.75 250.5 375.25 500 ▇▇▇▇▇
rank_2020 191 0.72 250.50 144.48 1 125.75 250.5 375.25 500 ▇▇▇▇▇
differential 0 1.00 -12.32 199.04 -501 -137.50 -8.0 106.00 484 ▂▅▇▃▂
release_year 0 1.00 1982.87 14.55 1955 1971.00 1979.0 1994.00 2019 ▂▇▃▃▂
weeks_on_billboard 119 0.83 64.27 75.14 1 20.75 44.5 81.00 741 ▇▁▁▁▁
peak_billboard_position 0 1.00 61.19 77.16 1 2.00 17.0 111.50 201 ▇▁▁▁▂
spotify_popularity 37 0.95 55.81 14.95 10 46.00 57.0 68.00 91 ▁▃▇▇▂
artist_member_count 5 0.99 2.75 2.02 1 1.00 2.0 4.00 12 ▇▅▁▁▁
artist_birth_year_sum 5 0.99 5363.21 3947.13 1910 1948.00 3896.0 7845.00 23368 ▇▅▁▁▁
debut_album_release_year 5 0.99 1976.87 14.96 1934 1966.25 1973.0 1989.00 2019 ▁▇▇▅▂
ave_age_at_top_500 5 0.99 29.61 9.35 17 24.04 27.0 31.00 88 ▇▂▁▁▁
years_between 5 0.99 5.93 8.42 0 1.00 3.0 7.00 54 ▇▁▁▁▁
# Check the data, especially the column names, to see which variables are available.

colnames(rolling_stone)
##  [1] "sort_name"                "clean_name"              
##  [3] "album"                    "rank_2003"               
##  [5] "rank_2012"                "rank_2020"               
##  [7] "differential"             "release_year"            
##  [9] "genre"                    "type"                    
## [11] "weeks_on_billboard"       "peak_billboard_position" 
## [13] "spotify_popularity"       "spotify_url"             
## [15] "artist_member_count"      "artist_gender"           
## [17] "artist_birth_year_sum"    "debut_album_release_year"
## [19] "ave_age_at_top_500"       "years_between"           
## [21] "album_id"
# Keep only the albums that have both variables used in the plots below.
# na.omit() removed a row whenever ANY column was missing (for example
# rank_2003 or weeks_on_billboard), which discarded albums that do have a
# genre and a 2020 rank. drop_na() restricted to the plotted columns keeps them.
rolling_stone_clean <- tidyr::drop_na(rolling_stone, genre, rank_2020)
# I tested different graphs

# Box plot of the 2020 ranking for each genre; outliers are drawn in red.
# The Brewer "Set2" palette only provides 8 colours, fewer than the number of
# genres, so it raised a warning on every draw and the genres beyond the
# eighth got no palette colour. The discrete viridis scale generates as many
# colours as there are genres.
boxplot_plot <- ggplot(rolling_stone_clean, aes(x = genre, y = rank_2020, fill = genre)) +
  geom_boxplot(outlier.colour = "red", outlier.size = 2, alpha = 0.6) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(title = "Ranking album (2020)",
       x = "Genre",
       y = "Ranking") +
  # Tilt the genre labels so the long names do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(boxplot_plot)
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

# Violin plot of the 2020 ranking for each genre.
# The Brewer "Set2" palette only provides 8 colours, fewer than the number of
# genres, so it raised a warning on every draw and the genres beyond the
# eighth got no palette colour. The discrete viridis scale generates as many
# colours as there are genres.
violin_plot <- ggplot(rolling_stone_clean, aes(x = genre, y = rank_2020, fill = genre)) +
  geom_violin(alpha = 0.6) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(title = "Ranking album (2020)",
       x = "Genre",
       y = "Ranking") +
  # Tilt the genre labels so the long names do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(violin_plot)
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position
##   adjustment purposes.
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

library(ggridges)
# Ridgeline plot: one density curve of the 2020 ranking per genre.
# The Brewer "Set2" palette only provides 8 colours, fewer than the number of
# genres, so it raised a warning on every draw and the genres beyond the
# eighth got no palette colour. The discrete viridis scale generates as many
# colours as there are genres.
ridgeline_plot <- ggplot(rolling_stone_clean, aes(y = genre, x = rank_2020, fill = genre)) +
  geom_density_ridges(alpha = 0.6) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(title = "Ranking album (2020)",
       x = "Ranking",
       y = "Genre") +
  # Smaller text keeps the genre names readable on the y axis.
  theme(axis.text.y = element_text(size = 8))

print(ridgeline_plot)
## Picking joint bandwidth of 59.4
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

# Arrange the box plot and the violin plot in one figure, with automatic
# panel labels.
combined_plot <- cowplot::plot_grid(
  plotlist = list(boxplot_plot, violin_plot),
  labels = "AUTO"
)
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position
##   adjustment purposes.
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# Draw the combined figure.
print(combined_plot)