TyT2019W35 - tvthemes: Les Simpsons

By Johanie Fournier, agr. in rstats tidyverse tidytuesday

August 27, 2019

Get the data

# Download the TidyTuesday guest-star table. The file is pipe-delimited and
# quoting is disabled (quote = "") so quote characters are read literally.
simpsons <- readr::read_delim(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-08-27/simpsons-guests.csv",
  delim = "|",
  quote = ""
)
## Rows: 1386 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "|"
## chr (6): season, number, production_code, episode_title, guest_star, role
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Explore the data

# Quick overview of every column; all six columns were read as character,
# so the summary only reports length, class and mode.
summary(simpsons)
##     season             number          production_code    episode_title     
##  Length:1386        Length:1386        Length:1386        Length:1386       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##   guest_star            role          
##  Length:1386        Length:1386       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character

Prepare the data

# How many seasons are there?
# inspect_cat() summarises the categorical columns; the table for the
# `season` column is pulled out of its `levels` element below.
inspect_season <- simpsons %>%
  inspect_cat()

# Keep only the numbered seasons (the movie is labelled "Movie") and convert
# the season labels to numbers.
season <- inspect_season$levels$season %>%
  filter(value != "Movie") %>%
  mutate(value = as.numeric(value))
# Finding: no data for seasons 15 and 20.

# Identify the guest star who comes back most often across all episodes.
simpsons_freq <- simpsons %>%
  inspect_cat()

# Table of guest_star values, to be inspected by eye.
acteur <- simpsons_freq[["levels"]][["guest_star"]]
# Finding: Marcia Wallace

# Share of guest appearances per season: Marcia Wallace vs. everyone else.
# Result is in long format, one row per (season, acteur), where `acteur` is
# "pct_Marcia" or "pct_autre" and `pourcentage` is on a 0-100 scale.
simpsons_cat <- simpsons %>%
  # Drop the movie *before* converting: as.numeric("Movie") would produce NA
  # together with a coercion warning.
  filter(!season %in% "Movie") %>%
  mutate(
    season = as.numeric(season),
    acteur = ifelse(guest_star %in% "Marcia Wallace", "Marcia Wallace", "Autre")
  ) %>%
  # Any remaining non-numeric season label is discarded, as before.
  filter(!is.na(season)) %>%
  # count() returns an ungrouped table, so no trailing ungroup() is needed.
  count(season, acteur, name = "nb") %>%
  # One column per category; a season where a category never appears gets 0
  # instead of NA so the percentages below stay defined.
  pivot_wider(
    names_from = acteur,
    values_from = nb,
    values_fill = list(nb = 0L)
  ) %>%
  mutate(
    total = `Marcia Wallace` + Autre,
    pct_Marcia = `Marcia Wallace` / total * 100,
    pct_autre = Autre / total * 100
  ) %>%
  select(season, pct_Marcia, pct_autre) %>%
  pivot_longer(-season, names_to = "acteur", values_to = "pourcentage") %>%
  # Seasons 15 and 20 were found to have no rows during exploration; add them
  # by hand as 100% "other" so that every season gets a bar.
  # NOTE(review): hard-coded from the exploration step - re-check if the
  # source data changes.
  add_row(season = 15, acteur = "pct_autre", pourcentage = 100) %>%
  add_row(season = 20, acteur = "pct_autre", pourcentage = 100)

# Subset used to draw bars and data labels for Marcia Wallace only.
# Zero percentages are turned into NA so that no bar or label is drawn for
# seasons in which she does not appear.
simpson_MW <- filter(simpsons_cat, acteur == "pct_Marcia") %>%
  mutate(pourcentage = na_if(pourcentage, 0))

Visualize the data

# Chart 1: one horizontal bar per season showing the share of guest
# appearances made by Marcia Wallace.
# NOTE(review): the text uses the "Akbar" font family; it presumably has to be
# registered first (see the commented-out loadfonts() call) - confirm.
# loadfonts()
gg <- ggplot(
  simpsons_cat,
  aes(x = season, y = pourcentage, group = acteur, fill = acteur)
) +
  # Full 0-100% bars, filled with the background colour so only the cream
  # outline is visible.
  geom_col(
    position = "stack", width = 0.80,
    color = "#FEE8C8", fill = "#7199E1"
  ) +
  # Marcia Wallace's share drawn on top as a solid cream bar.
  geom_col(
    data = simpson_MW,
    position = "stack", width = 0.85,
    color = "#FEE8C8", fill = "#FEE8C8"
  ) +
  # Data labels. The label is computed from the layer's own `pourcentage`
  # column rather than `simpson_MW$pourcentage`, so it always stays aligned
  # with the rows being drawn.
  geom_text(
    data = simpson_MW,
    aes(label = paste0(round(pourcentage, 0), "%")),
    color = "#46732EFF", size = 3,
    position = position_stack(vjust = 0.5),
    family = "Akbar", fontface = "bold"
  ) +
  coord_flip() +
  # Axes: only 0% and 100% on the percentage axis, one tick per season.
  scale_y_continuous(
    breaks = c(0, 100), limits = c(0, 103), expand = c(0, 0),
    labels = function(x) paste0(x, "%")
  ) +
  scale_x_continuous(
    breaks = seq(1, 30, by = 1), limits = c(0, 31), expand = c(0, 0)
  ) +
  # Titles
  labs(
    title = "The Simpsons: Do you like Edna?",
    subtitle = "\nMarcia Wallace is the actress who is invited most often in this series. She\nis present in 13% of the episodes. During the seasons, her presence varies.\nIt has increased to 27% from the 1st to the 9th season to decrease to\n9% in the 25th season, the last season before her death.",
    x = "Seasons",
    y = " ",
    caption = "SOURCE:  Wikipedia  |  DESIGN: Johanie Fournier, agr."
  ) +
  # Theme: no legend, flat blue background, no grid/axis lines, custom text.
  theme(
    legend.position = "none",
    panel.border = element_blank(),
    panel.background = element_rect(fill = "#7199E1"),
    plot.background = element_rect(fill = "#7199E1"),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line.x = element_blank(),
    axis.line.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.ticks.x = element_blank(),
    plot.title = element_text(
      size = 27, hjust = 0, vjust = 0, face = "bold",
      family = "Akbar", color = "#fed90f"
    ),
    plot.subtitle = element_text(
      size = 12, hjust = 0, vjust = 0, family = "Akbar", color = "#46732EFF"
    ),
    plot.caption = element_text(
      size = 8, hjust = 1, vjust = 0, family = "Akbar", color = "#D1C19E"
    ),
    axis.title.y = element_text(
      size = 10, hjust = 1, vjust = 0.5, family = "Akbar", color = "#FEE8C8"
    ),
    axis.title.x = element_blank(),
    axis.text.x = element_text(
      size = 10, hjust = 0.5, vjust = 0.5, family = "Akbar", color = "#FEE8C8"
    ),
    axis.text.y = element_text(
      size = 10, hjust = 0.5, vjust = 0.5, family = "Akbar", color = "#FEE8C8"
    )
  )
Posted on:
August 27, 2019
Length:
3 minute read, 477 words
Categories:
rstats tidyverse tidytuesday
Tags:
rstats tidyverse tidytuesday
See Also:
Predicting MO with H2O Models from IRDA data
IRDA soil data
This is the beginning of a cheat sheet!