% CORIA-TALN 2026 proceedings, volume 2: conference paper entry with an English abstract.
% Accented letters are written as braced BibTeX special characters (e.g. {\'e}) so that
% sorting, label generation and title recasing treat each one as a single letter.
@inproceedings{Xu-Seghier-Millour-Gonzalez-Gallardo-Antoine:CORIA-TALN-2026:2026,
    author = "Xu, Ziyan and Seghier, Marina and Millour, Alice and Gonz{\'a}lez-Gallardo, Carlos-Emiliano and Antoine, Jean-Yves",
    title = "{\'E}valuation de l'adaptabilit{\'e} des grands mod{\`e}les de langage aux genres linguistiques attest{\'e}s ({AGLAGLA})",
    booktitle = "Actes de CORIA-TALN 2026. Actes des 33{\`e}me Conf{\'e}rence sur le Traitement Automatique des Langues Naturelles.  Volume 2 : articles d{\'e}j{\`a} publi{\'e}s",
    month = jun,
    year = "2026",
    address = "Nantes, France",
    publisher = "Association pour le Traitement Automatique des Langues",
    pages = "18--19",
    abstract = "The issue of adapting models to the linguistic context, particularly to textual genre or domain, has long been central in machine learning techniques in NLP. It is often considered that large language models (LLM) have such generalization capabilities that they are no longer concerned with this problem. However, their ability to adapt to textual variation remains underexplored, preventing full confirmation of this hypothesis. Our study addresses this question through the task of named entity recognition (NER) in French, conducted on NEM.fr, a multi-genre reference corpus that we specifically developed to evaluate the robustness of NER systems across diverse linguistic contexts. The NEM.fr corpus covers 11 text types, ranging from legal and encyclopedic prose to poetry, including political speech, spontaneous speech, and online exchanges. We evaluate the DeepSeek R1 reasoning-oriented model across six prompt configurations (zero-shot, one-shot, and few-shot, with and without chain-of-thought reasoning), while keeping the named entity annotation scheme, prompt format, and evaluation pipeline constant to isolate the role of genre variation. Performance is measured using the F1 measure, in both strict and fuzzy versions, depending on the boundaries of detected entities. The results show that prompt choices have little effect once the model has learned the task format, but that differences in text type strongly influence the results: fuzzy F1 scores range from about 0.85 in formal genres to less than 0.20 in informal genres. Even under strictly controlled conditions, LLM behavior proves highly sensitive to textual regularity and stylistic variation, highlighting textual genre as a key factor in evaluating model robustness and suggesting that LLMs, too, remain concerned with this issue.",
    keywords = "Reconnaissance d'entit{\'e}s nomm{\'e}es, Grands mod{\`e}les de langue, Extraction d'informations",
    url = "9.pdf"
}
