@unpublished{ficarra2025distributional,title={{A Distributional Perspective on Word Learning in Neural Language Models}},author={Ficarra, Filippo and Cotterell, Ryan and Warstadt, Alex},year={2025},annote={Unpublished manuscript},}
Manuscript
Can Language Models Learn Typologically Implausible Languages?
Tianyang Xu, Tatsuki Kuribayashi, Yohei Oseki, and 2 more authors
@unpublished{xu2025implausible,title={{Can Language Models Learn Typologically Implausible Languages?}},author={Xu, Tianyang and Kuribayashi, Tatsuki and Oseki, Yohei and Cotterell, Ryan and Warstadt, Alex},year={2025},annote={Unpublished manuscript},}
2024
TACL
Investigating Critical Period Effects in Language Acquisition through Neural Language Models
Ionut Constantinescu, Tiago Pimentel, Ryan Cotterell, and 1 more author
Transactions of the Association for Computational Linguistics, 2024
Humans appear to have a critical period (CP) for language acquisition: Second language (L2) acquisition becomes harder after early childhood, and ceasing exposure to a first language (L1) after this period (but not before) typically does not lead to substantial loss of L1 proficiency. It is unknown whether these CP effects result from innately determined brain maturation or from a stabilization of neural connections naturally induced by experience. In this study, we use language models (LMs) to test the extent to which these phenomena are peculiar to humans, or shared by a broader class of language learners. We vary the age of exposure by training LMs on language pairs in various experimental conditions, and find that LMs, which lack any direct analog to innate maturational stages, do not show CP effects when the age of exposure of L2 is delayed. Our results contradict the claim that CP effects are an inevitable result of statistical learning, and they are consistent with an innate mechanism for CP effects. We show that we can reverse-engineer the CP by introducing a regularizer partway through training to simulate a maturational decrease in plasticity. All in all, our results suggest that L1 learning on its own may not be enough to induce a CP, and additional engineering is necessary to make language models more cognitively plausible.
@article{constantinescu2024critical,title={Investigating {Critical} {Period} {Effects} in {Language} {Acquisition} through {Neural} {Language} {Models}},language={en},urldate={2024-10-23},journal={Transactions of the Association for Computational Linguistics},author={Constantinescu, Ionut and Pimentel, Tiago and Cotterell, Ryan and Warstadt, Alex},year={2024},keywords={Computer Science - Computation and Language},}
COLING
Automatic annotation of grammaticality in child-caregiver conversations
Mitja Nikolaus, Abhishek Agrawal, Petros Kaklamanis, and 2 more authors
In Proceedings of the 2024 joint international conference on computational linguistics, language resources and evaluation (LREC-COLING 2024), May 2024
The acquisition of grammar has been a central question to adjudicate between theories of language acquisition. In order to conduct faster, more reproducible, and larger-scale corpus studies on grammaticality in child-caregiver conversations, tools for automatic annotation can offer an effective alternative to tedious manual annotation. We propose a coding scheme for context-dependent grammaticality in child-caregiver conversations and annotate more than 4,000 utterances from a large corpus of transcribed conversations. Based on these annotations, we train and evaluate a range of NLP models. Our results show that fine-tuned Transformer-based models perform best, achieving human inter-annotation agreement levels. As a first application and sanity check of this tool, we use the trained models to annotate a corpus almost two orders of magnitude larger than the manually annotated data and verify that children’s grammaticality shows a steady increase with age. This work contributes to the growing literature on applying state-of-the-art NLP methods to help study child language acquisition at scale.
@inproceedings{nikolaus2024automatic,address={Torino, Italia},title={Automatic annotation of grammaticality in child-caregiver conversations},booktitle={Proceedings of the 2024 joint international conference on computational linguistics, language resources and evaluation ({LREC}-{COLING} 2024)},publisher={ELRA and ICCL},author={Nikolaus, Mitja and Agrawal, Abhishek and Kaklamanis, Petros and Warstadt, Alex and Fourtassi, Abdellah},editor={Calzolari, Nicoletta and Kan, Min-Yen and Hoste, Veronique and Lenci, Alessandro and Sakti, Sakriani and Xue, Nianwen},month=may,year={2024},pages={1832--1844},}
Under Review
Bigger is not always better: The importance of human-scale language modeling for psycholinguistics
Ethan Gotlieb Wilcox, Michael Hu, Aaron Mueller, and 6 more authors
@unpublished{wilcox2024bigger,title={Bigger is not always better: {The} importance of human-scale language modeling for psycholinguistics},author={Wilcox, Ethan Gotlieb and Hu, Michael and Mueller, Aaron and Linzen, Tal and Warstadt, Alex and Choshen, Leshem and Zhuang, Chengxu and Cotterell, Ryan and Williams, Adina},year={2024},}
EMNLP
Surprise! Uniform Information Density Isn’t the Whole Story: Predicting Surprisal Contours in Long-form Discourse
Eleftheria Tsipidi, Franz Nowak, Ryan Cotterell, and 3 more authors
In Proceedings of the 2024 conference on empirical methods in natural language processing (EMNLP), Nov 2024
The Uniform Information Density (UID) hypothesis posits that speakers tend to distribute information evenly across linguistic units to achieve efficient communication. Of course, information rate in texts and discourses is not perfectly uniform. While these fluctuations can be viewed as theoretically uninteresting noise on top of a uniform target, another explanation is that UID is not the only functional pressure regulating information content in a language. Speakers may also seek to maintain interest, adhere to writing conventions, and build compelling arguments. In this paper, we propose one such functional pressure; namely that speakers modulate information rate based on location within a hierarchically-structured model of discourse. We term this the Structured Context Hypothesis and test it by predicting the surprisal contours of naturally occurring discourses extracted from large language models using predictors derived from discourse structure. We find that hierarchical predictors are significant predictors of a discourse’s information contour and that deeply nested hierarchical predictors are more predictive than shallow ones. This work takes an initial step beyond UID to propose testable hypotheses for why the information rate fluctuates in predictable ways.
@inproceedings{tsipidi2024surprise,address={Miami, USA},title={{S}urprise! {U}niform {I}nformation {D}ensity Isn't the Whole Story: {P}redicting Surprisal Contours in Long-form Discourse},booktitle={Proceedings of the 2024 conference on empirical methods in natural language processing ({EMNLP})},publisher={Association for Computational Linguistics},author={Tsipidi, Eleftheria and Nowak, Franz and Cotterell, Ryan and Wilcox, Ethan and Giulianelli, Mario and Warstadt, Alex},month=nov,year={2024},}
2023
ACL
Generalizing backpropagation for gradient-based interpretability
Kevin Du, Lucas Torroba Hennigen, Niklas Stoehr, and 2 more authors
In Proceedings of the 61st annual meeting of the association for computational linguistics (volume 1: Long papers), Jul 2023
Many popular feature-attribution methods for interpreting deep neural networks rely on computing the gradients of a model’s output with respect to its inputs. While these methods can indicate which input features may be important for the model’s prediction, they reveal little about the inner workings of the model itself. In this paper, we observe that the gradient computation of a model is a special case of a more general formulation using semirings. This observation allows us to generalize the backpropagation algorithm to efficiently compute other interpretable statistics about the gradient graph of a neural network, such as the highest-weighted path and entropy. We implement this generalized algorithm, evaluate it on synthetic datasets to better understand the statistics it computes, and apply it to study BERT’s behavior on the subject–verb number agreement task (SVA). With this method, we (a) validate that the amount of gradient flow through a component of a model reflects its importance to a prediction and (b) for SVA, identify which pathways of the self-attention mechanism are most important.
@inproceedings{du2023generalizing,address={Toronto, Canada},title={Generalizing backpropagation for gradient-based interpretability},doi={10.18653/v1/2023.acl-long.669},booktitle={Proceedings of the 61st annual meeting of the association for computational linguistics (volume 1: {Long} papers)},publisher={Association for Computational Linguistics},author={Du, Kevin and Torroba Hennigen, Lucas and Stoehr, Niklas and Warstadt, Alex and Cotterell, Ryan},editor={Rogers, Anna and Boyd-Graber, Jordan and Okazaki, Naoaki},month=jul,year={2023},pages={11979--11995},}
BabyLM
WhisBERT: Multimodal text-audio language modeling on 100M words
Lukas Wolf, Klemen Kotar, Greta Tuckute, and 4 more authors
In Proceedings of the BabyLM challenge at the 27th conference on computational natural language learning, Dec 2023
@inproceedings{wolf2023whisbert,address={Singapore},title={{WhisBERT}: {Multimodal} text-audio language modeling on {100M} words},doi={10.18653/v1/2023.conll-babylm.21},booktitle={Proceedings of the {BabyLM} challenge at the 27th conference on computational natural language learning},publisher={Association for Computational Linguistics},author={Wolf, Lukas and Kotar, Klemen and Tuckute, Greta and Hosseini, Eghbal and I. Regev, Tamar and Gotlieb Wilcox, Ethan and Warstadt, Alexander Scott},editor={Warstadt, Alex and Mueller, Aaron and Choshen, Leshem and Wilcox, Ethan and Zhuang, Chengxu and Ciro, Juan and Mosquera, Rafael and Paranjabe, Bhargavi and Williams, Adina and Linzen, Tal and Cotterell, Ryan},month=dec,year={2023},pages={253--258},}
BabyLM
Findings of the BabyLM challenge: Sample-efficient pretraining on developmentally plausible corpora
Alex Warstadt, Aaron Mueller, Leshem Choshen, and 8 more authors
In Proceedings of the BabyLM challenge at the 27th conference on computational natural language learning, Dec 2023
@inproceedings{warstadt2023findings,address={Singapore},title={Findings of the {BabyLM} challenge: {Sample}-efficient pretraining on developmentally plausible corpora},doi={10.18653/v1/2023.conll-babylm.1},booktitle={Proceedings of the {BabyLM} challenge at the 27th conference on computational natural language learning},publisher={Association for Computational Linguistics},author={Warstadt, Alex and Mueller, Aaron and Choshen, Leshem and Wilcox, Ethan and Zhuang, Chengxu and Ciro, Juan and Mosquera, Rafael and Paranjabe, Bhargavi and Williams, Adina and Linzen, Tal and Cotterell, Ryan},editor={Warstadt, Alex and Mueller, Aaron and Choshen, Leshem and Wilcox, Ethan and Zhuang, Chengxu and Ciro, Juan and Mosquera, Rafael and Paranjabe, Bhargavi and Williams, Adina and Linzen, Tal and Cotterell, Ryan},month=dec,year={2023},pages={1--34},}
BabyLM
Acquiring linguistic knowledge from multimodal input
Theodor Amariucai, and Alexander Scott Warstadt
In Proceedings of the BabyLM challenge at the 27th conference on computational natural language learning, Dec 2023
@inproceedings{amariucai2023acquiring,address={Singapore},title={Acquiring linguistic knowledge from multimodal input},doi={10.18653/v1/2023.conll-babylm.11},booktitle={Proceedings of the {BabyLM} challenge at the 27th conference on computational natural language learning},publisher={Association for Computational Linguistics},author={Amariucai, Theodor and Warstadt, Alexander Scott},editor={Warstadt, Alex and Mueller, Aaron and Choshen, Leshem and Wilcox, Ethan and Zhuang, Chengxu and Ciro, Juan and Mosquera, Rafael and Paranjabe, Bhargavi and Williams, Adina and Linzen, Tal and Cotterell, Ryan},month=dec,year={2023},pages={128--141},}
BabyLM
Proceedings of the BabyLM Challenge at the 27th Conference on Computational Natural Language Learning
@proceedings{warstadt2023babylm,title={Proceedings of the BabyLM Challenge at the 27th Conference on Computational Natural Language Learning},editor={Warstadt, Alex and Mueller, Aaron and Choshen, Leshem and Wilcox, Ethan and Zhuang, Chengxu and Ciro, Juan and Mosquera, Rafael and Paranjabe, Bhargavi and Williams, Adina and Linzen, Tal and Cotterell, Ryan},month=dec,year={2023},address={Singapore},publisher={Association for Computational Linguistics},}
2022
Book Chapter
What artificial neural networks can tell us about human language acquisition
Alex Warstadt, and Samuel R Bowman
In Algebraic Structures in Natural Language, Dec 2022
Rapid progress in machine learning for natural language processing has the potential to transform debates about how humans learn language. However, the learning environments and biases of current artificial learners and humans diverge in ways that weaken the impact of the evidence obtained from learning simulations. For example, today’s most effective neural language models are trained on roughly one thousand times the amount of linguistic data available to a typical child. To increase the relevance of learnability results from computational models, we need to train model learners without significant advantages over humans. If an appropriate model successfully acquires some target linguistic knowledge, it can provide a proof of concept that the target is learnable in a hypothesized human learning scenario. Plausible model learners will enable us to carry out experimental manipulations to make causal inferences about variables in the learning environment, and to rigorously test poverty-of-the-stimulus-style claims arguing for innate linguistic knowledge in humans. Comparable experiments will never be possible with human subjects due to practical and ethical considerations. So far, attempts to deprive current models of unfair advantages fail to achieve human-level grammatical knowledge. But before we can justifiably conclude that language learning requires more prior domain-specific knowledge than current models possess, we must first explore other training regimes as ways to make computational learners more efficient at learning from limited linguistic input.
@incollection{warstadt2022what,title={What artificial neural networks can tell us about human language acquisition},booktitle={Algebraic {Structures} in {Natural} {Language}},publisher={CRC Press},author={Warstadt, Alex and Bowman, Samuel R},editor={Lappin, Shalom and Bernardy, Jean-Philippe},year={2022},pages={17--60},}
Diss. Chapter
The Role of Indirect Evidence in Grammar Learning: Investigations with Causal Manipulations of the Learning Environment
Alex Warstadt
In Artificial neural networks as models of human language acquisition, Dec 2022
@incollection{warstadt2022indirect,title={The {Role} of {Indirect} {Evidence} in {Grammar} {Learning}: {Investigations} with {Causal} {Manipulations} of the {Learning} {Environment}},volume={Chapter 6},booktitle={Artificial neural networks as models of human language acquisition},publisher={PhD Dissertation, New York University},author={Warstadt, Alex},year={2022},}
2021
ACL
When Do You Need Billions of Words of Pretraining Data?
Yian Zhang, Alex Warstadt, Xiaocheng Li, and 1 more author
In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), Aug 2021
NLP is currently dominated by language models like RoBERTa which are pretrained on billions of words. But what exact knowledge or skills do Transformer LMs learn from large-scale pretraining that they cannot learn from less data? To explore this question, we adopt five styles of evaluation: classifier probing, information-theoretic probing, unsupervised relative acceptability judgments, unsupervised language model knowledge probing, and fine-tuning on NLU tasks. We then draw learning curves that track the growth of these different measures of model ability with respect to pretraining data volume using the MiniBERTas, a group of RoBERTa models pretrained on 1M, 10M, 100M and 1B words. We find that these LMs require only about 10M to 100M words to learn to reliably encode most syntactic and semantic features we test. They need a much larger quantity of data in order to acquire enough commonsense knowledge and other skills required to master typical downstream NLU tasks. The results suggest that, while the ability to encode linguistic features is almost certainly necessary for language understanding, it is likely that other, unidentified, forms of knowledge are the major drivers of recent improvements in language understanding among large pretrained models.
@inproceedings{zhang2021when,address={Online},title={When {Do} {You} {Need} {Billions} of {Words} of {Pretraining} {Data}?},doi={10.18653/v1/2021.acl-long.90},urldate={2021-09-17},booktitle={Proceedings of the 59th {Annual} {Meeting} of the {Association} for {Computational} {Linguistics} and the 11th {International} {Joint} {Conference} on {Natural} {Language} {Processing} ({Volume} 1: {Long} {Papers})},publisher={Association for Computational Linguistics},author={Zhang, Yian and Warstadt, Alex and Li, Xiaocheng and Bowman, Samuel R.},month=aug,year={2021},pages={1112--1125},}