Load required libraries.

library(wordbankr)
library(langcog)
library(dplyr)
library(ggplot2)
library(directlabels)

Get administration data and filter to administrations of Words & Sentences that have sex/gender coded.

# Administrations of the Words & Sentences ("WS") form whose sex field is
# recorded, reduced to the columns used further down in this script.
vocab_admins <- get_administration_data() %>%
  filter(form == "WS", !is.na(sex)) %>%
  select(data_id, language, form, age, sex, production)

Get item information to find the number of word items on each language’s Words & Sentences form.

# Per-language count of items of type "word" on the "WS" form; the count is
# stored in column `n`.
num_words <- get_item_data() %>%
  filter(form == "WS" & type == "word") %>%
  group_by(language) %>%
  summarise(n = n())

Normalize productive vocabulary size as a proportion of items and calculate median vocabulary size for each language, sex/gender, and age.

# Median productive vocabulary, expressed as a proportion of the word items on
# the language's form, for every language x sex x age cell.
vocab_data <- vocab_admins %>%
  # Join on the language key explicitly: avoids the "Joining, by = ..." message
  # and keeps the join stable if either table ever gains another shared column.
  left_join(num_words, by = "language") %>%
  # `n` is the per-language word count brought in by the join.
  mutate(production = as.numeric(production) / n) %>%
  group_by(language, sex, age) %>%
  summarise(median = median(production)) %>%
  # summarise() only peels off the last grouping level; drop the remaining
  # language/sex grouping so downstream code receives a plain table.
  ungroup()

Plot median vocabulary size over age by sex/gender, with one panel per language (Hebrew is excluded from the plot).

# Median vocabulary proportion against age, one line per sex and one panel per
# language. Hebrew rows are dropped from the plotted data, but the x-axis
# breaks and limits are still computed from the ages in the full `vocab_data`.
ggplot(
  data = filter(vocab_data, language != "Hebrew"),
  mapping = aes(x = age, y = median, colour = sex, label = sex)
) +
  geom_line(size = 1) +
  # Direct labels replace the legend: directlabels' "last.qp" method, with the
  # label x position shifted by 0.2.
  geom_dl(
    method = list(
      dl.trans(x = x + 0.2),
      "last.qp",
      cex = 1,
      fontfamily = "Open Sans"
    )
  ) +
  facet_wrap(~ language) +
  scale_colour_solarized() +
  scale_x_continuous(
    name = "\nAge (months)",
    breaks = seq(from = min(vocab_data$age), to = max(vocab_data$age), by = 2),
    # Upper limit extends one month past the oldest age
    # (presumably to leave room for the line labels).
    limits = c(min(vocab_data$age), max(vocab_data$age) + 1)
  ) +
  scale_y_continuous(
    name = "Median Productive Vocabulary (proportion of total words)\n",
    limits = c(0, 1)
  ) +
  theme_bw(base_size = 14) +
  theme(
    legend.position = "none",
    text = element_text(family = "Open Sans")
  )