
{
  "service": "elmekki-site-api",
  "resource": "publications",
  "generated_at": "2026-05-03T22:58:18-07:00",
  "total": 25,
  "items": [


    {
      "id": "lqm-linguistically-motivated-multidimensional-quality-metrics-for-machine-translation",
      "title": "LQM: Linguistically Motivated Multidimensional Quality Metrics for Machine Translation",
      "authors": "Samar M. Magdy, Fakhraddin Alwajih, Abdellah El Mekki, Wesam El-Sayed, Muhammad Abdul-Mageed",
      "venue": "The 64th Annual Meeting of the Association for Computational Linguistics: ACL 2026 (Long Papers Findings)",
      "date": "2026-07-05",
      "year": 2026,
      "featured": false,
      "award": "",
      "url": "https://elmekki.me/articles/#lqm-linguistically-motivated-multidimensional-quality-metrics-for-machine-translation",
      "paper_url": "https://arxiv.org/abs/2604.18490",
      "abstract": "Existing MT evaluation frameworks, including automatic metrics and human evaluation schemes such as Multidimensional Quality Metrics (MQM), are largely language-agnostic. However, they often fail to capture dialect- and culture-specific errors in diglossic languages such as Arabic, where translation failures stem from mismatches in language variety, content coverage, and pragmatic appropriateness rather than surface form alone. We introduce LQM: Linguistically Motivated Multidimensional Quality Metrics for MT. LQM is a hierarchical error taxonomy for diagnosing MT errors through six linguistically grounded levels: sociolinguistics, pragmatics, semantics, morphosyntax, orthography, and graphetics. We construct a bidirectional parallel corpus of 3,850 sentences spanning seven Arabic dialects, derived from conversational, culturally rich content. We evaluate six LLMs in a zero-shot setting and conduct expert span-level human annotation using LQM, producing 6,113 labeled error spans across 3,495 unique erroneous sentences, along with severity-weighted quality scores. We complement this analysis with an automatic metric (spBLEU). Though validated here on Arabic, LQM is a language-agnostic framework designed to be easily applied to or adapted for other languages.",
      "bibtex": "@misc{magdy2026lqmlinguisticallymotivatedmultidimensional,\n      title={LQM: Linguistically Motivated Multidimensional Quality Metrics for Machine Translation},\n      author={Samar M. Magdy and Fakhraddin Alwajih and Abdellah El Mekki and Wesam El-Sayed and Muhammad Abdul-Mageed},\n      year={2026},\n      eprint={2604.18490},\n      archivePrefix={arXiv},\n      primaryClass={cs.CL},\n      url={https://arxiv.org/abs/2604.18490},\n}\n"
    },


    {
      "id": "alexandria-a-multi-domain-dialectal-arabic-machine-translation-dataset-for-culturally-inclusive-and-linguistically-diverse-llms",
      "title": "Alexandria: A Multi-Domain Dialectal Arabic Machine Translation Dataset for Culturally Inclusive and Linguistically Diverse LLMs",
      "authors": "Abdellah El Mekki, Samar M. Magdy, Houdaifa Atou, Ruwa AbuHweidi, Baraah Qawasmeh, Omer Nacar, Thikra Al-hibiri, Razan Saadie, Hamzah Alsayadi, Nadia Ghezaiel Hammouda, Alshima Alkhazimi, Aya Hamod, Al-Yas Al-Ghafri, Wesam El-Sayed, Asila Al sharji, Mohamad Ballout, Anas Belfathi, Karim Ghaddar, Serry Sibaee, Alaa Aoun, Areej Asiri, Lina Abureesh, Ahlam Bashiti, Majdal Yousef, Abdulaziz Hafiz, Yehdih Mohamed, Emira Hamedtou, Brakehe Brahim, Rahaf Alhamouri, Youssef Nafea, Aya El Aatar, Walid Al-Dhabyani, Emhemed Hamed, Sara Shatnawi, Fakhraddin Alwajih, Khalid Elkhidir, Ashwag Alasmari, Abdurrahman Gerrio, Omar Alshahri, AbdelRahim A. Elmadany, Ismail Berrada, Amir Azad Adli Alkathiri, Fadi A Zaraket, Mustafa Jarrar, Yahya Mohamed El Hadj, Hassan Alhuzali, Muhammad Abdul-Mageed",
      "venue": "The 64th Annual Meeting of the Association for Computational Linguistics: ACL 2026 (Long Papers Main Conference)",
      "date": "2026-07-05",
      "year": 2026,
      "featured": false,
      "award": "",
      "url": "https://elmekki.me/articles/#alexandria-a-multi-domain-dialectal-arabic-machine-translation-dataset-for-culturally-inclusive-and-linguistically-diverse-llms",
      "paper_url": "https://arxiv.org/abs/2601.13099",
      "abstract": "Arabic is a highly diglossic language where most daily communication occurs in regional dialects rather than Modern Standard Arabic (MSA). Despite this, machine translation (MT) systems often generalize poorly to dialectal input, limiting their utility for millions of speakers. We introduce Alexandria, a large-scale, community-driven, human-translated dataset designed to bridge this gap. Alexandria covers 13 Arab countries and 11 high-impact domains, including health, education, and agriculture. Unlike previous resources, Alexandria provides unprecedented granularity by associating contributions with city-of-origin metadata, capturing authentic local varieties beyond coarse regional labels. The dataset consists of parallel English-Dialectal Arabic multi-turn conversational scenarios annotated with speaker-addressee gender configurations, enabling the study of gender-conditioned variation in dialectal use. Comprising 107K total turns, Alexandria serves as both a training resource and as a rigorous benchmark for evaluating MT and Large Language Models (LLMs). Our automatic and human evaluation benchmarks the current capabilities of Arabic-aware LLMs in translating across diverse Arabic dialects and sub-dialects while exposing significant persistent challenges. The Alexandria dataset, the creation prompts, the translation and revision guidelines, and the evaluation code are publicly available.",
      "bibtex": "@misc{mekki2026alexandriamultidomaindialectalarabic,\n      title={Alexandria: A Multi-Domain Dialectal Arabic Machine Translation Dataset for Culturally Inclusive and Linguistically Diverse LLMs},\n      author={Abdellah El Mekki and Samar M. Magdy and Houdaifa Atou and Ruwa AbuHweidi and Baraah Qawasmeh and Omer Nacar and Thikra Al-hibiri and Razan Saadie and Hamzah Alsayadi and Nadia Ghezaiel Hammouda and Alshima Alkhazimi and Aya Hamod and Al-Yas Al-Ghafri and Wesam El-Sayed and Asila Al sharji and Mohamad Ballout and Anas Belfathi and Karim Ghaddar and Serry Sibaee and Alaa Aoun and Areej Asiri and Lina Abureesh and Ahlam Bashiti and Majdal Yousef and Abdulaziz Hafiz and Yehdih Mohamed and Emira Hamedtou and Brakehe Brahim and Rahaf Alhamouri and Youssef Nafea and Aya El Aatar and Walid Al-Dhabyani and Emhemed Hamed and Sara Shatnawi and Fakhraddin Alwajih and Khalid Elkhidir and Ashwag Alasmari and Abdurrahman Gerrio and Omar Alshahri and AbdelRahim A. Elmadany and Ismail Berrada and Amir Azad Adli Alkathiri and Fadi A Zaraket and Mustafa Jarrar and Yahya Mohamed El Hadj and Hassan Alhuzali and Muhammad Abdul-Mageed},\n      year={2026},\n      eprint={2601.13099},\n      archivePrefix={arXiv},\n      primaryClass={cs.CL},\n      url={https://arxiv.org/abs/2601.13099},\n}\n"
    },


    {
      "id": "pearl-a-multimodal-culturally-aware-arabic-instruction-dataset",
      "title": "Pearl: A Multimodal Culturally-Aware Arabic Instruction Dataset",
      "authors": "Fakhraddin Alwajih, Samar M. Magdy, Abdellah El Mekki, Omer Nacar, Youssef Nafea, Safaa Taher Abdelfadil, Abdulfattah Mohammed Yahya, Hamzah Luqman, Nada Almarwani, Samah Aloufi, Baraah Qawasmeh, Houdaifa Atou, Serry Sibaee, Hamzah A. Alsayadi, Walid Al-Dhabyani, Maged S. Al-shaibani, Aya El aatar, Nour Qandos, Rahaf Alhamouri, Samar Ahmad, Mohammed Anwar AL-Ghrawi, Aminetou Yacoub, Ruwa AbuHweidi, Vatimetou Mohamed Lemin, Reem Abdel-Salam, Ahlam Bashiti, Adel Ammar, Aisha Alansari, Ahmed Ashraf, Nora Alturayeif, Alcides Alcoba Inciarte, AbdelRahim A. Elmadany, Mohamedou Cheikh Tourad, Ismail Berrada, Mustafa Jarrar, Shady Shehata, Muhammad Abdul-Mageed",
      "venue": "Findings of the Association for Computational Linguistics: EMNLP 2025",
      "date": "2025-11-01",
      "year": 2025,
      "featured": false,
      "award": "",
      "url": "https://elmekki.me/articles/#pearl-a-multimodal-culturally-aware-arabic-instruction-dataset",
      "paper_url": "https://aclanthology.org/2025.findings-emnlp.1254.pdf",
      "abstract": "Mainstream large vision-language models (LVLMs) inherently encode cultural biases, highlighting the need for diverse multimodal datasets. To address this gap, we introduce PEARL, a large-scale Arabic multimodal dataset and benchmark explicitly designed for cultural understanding. Constructed through advanced agentic workflows and extensive human-in-the-loop annotations by 37 annotators from across the Arab world, PEARL comprises over 309K multimodal examples spanning ten culturally significant domains covering all Arab countries. We further provide two robust evaluation benchmarks (PEARL and PEARL-LITE) along with a specialized subset (PEARL-X) explicitly developed to assess nuanced cultural variations. Comprehensive evaluations on state-of-the-art open and proprietary LVLMs demonstrate that reasoning-centric instruction alignment substantially improves models’ cultural grounding compared to conventional scaling methods. PEARL establishes a foundational resource for advancing culturally-informed multimodal modeling research. All datasets and benchmarks are publicly available.",
      "bibtex": "@inproceedings{alwajih-etal-2025-pearl,\ntitle = \"Pearl: A Multimodal Culturally-Aware {A}rabic Instruction Dataset\",\nauthor = \"Alwajih, Fakhraddin  and\n  Magdy, Samar M.  and\n  El Mekki, Abdellah  and\n  Nacar, Omer  and\n  Nafea, Youssef  and\n  Abdelfadil, Safaa Taher  and\n  Yahya, Abdulfattah Mohammed  and\n  Luqman, Hamzah  and\n  Almarwani, Nada  and\n  Aloufi, Samah  and\n  Qawasmeh, Baraah  and\n  Atou, Houdaifa  and\n  Sibaee, Serry  and\n  Alsayadi, Hamzah A.  and\n  Al-Dhabyani, Walid  and\n  Al-shaibani, Maged S.  and\n  El aatar, Aya  and\n  Qandos, Nour  and\n  Alhamouri, Rahaf  and\n  Ahmad, Samar  and\n  AL-Ghrawi, Mohammed Anwar  and\n  Yacoub, Aminetou  and\n  AbuHweidi, Ruwa  and\n  Lemin, Vatimetou Mohamed  and\n  Abdel-Salam, Reem  and\n  Bashiti, Ahlam  and\n  Ammar, Adel  and\n  Alansari, Aisha  and\n  Ashraf, Ahmed  and\n  Alturayeif, Nora  and\n  Alcoba Inciarte, Alcides  and\n  Elmadany, AbdelRahim A.  and\n  Tourad, Mohamedou Cheikh  and\n  Berrada, Ismail  and\n  Jarrar, Mustafa  and\n  Shehata, Shady  and\n  Abdul-Mageed, Muhammad\",\neditor = \"Christodoulopoulos, Christos  and\n  Chakraborty, Tanmoy  and\n  Rose, Carolyn  and\n  Peng, Violet\",\nbooktitle = \"Findings of the Association for Computational Linguistics: EMNLP 2025\",\nmonth = nov,\nyear = \"2025\",\naddress = \"Suzhou, China\",\npublisher = \"Association for Computational Linguistics\",\nurl = \"https://aclanthology.org/2025.findings-emnlp.1254/\",\npages = \"23048--23079\",\nISBN = \"979-8-89176-335-7\",\nabstract = \"Mainstream large vision-language models (LVLMs) inherently encode cultural biases, highlighting the need for diverse multimodal datasets. To address this gap, we introduce PEARL, a large-scale Arabic multimodal dataset and benchmark explicitly designed for cultural understanding. 
Constructed through advanced agentic workflows and extensive human-in-the-loop annotations by 37 annotators from across the Arab world, PEARL comprises over 309K multimodal examples spanning ten culturally significant domains covering all Arab countries. We further provide two robust evaluation benchmarks (PEARL and PEARL-LITE) along with a specialized subset (PEARL-X) explicitly developed to assess nuanced cultural variations. Comprehensive evaluations on state-of-the-art open and proprietary LVLMs demonstrate that reasoning-centric instruction alignment substantially improves models' cultural grounding compared to conventional scaling methods. PEARL establishes a foundational resource for advancing culturally-informed multimodal modeling research. All datasets and benchmarks are publicly available.\"\n}\n"
    },


    {
      "id": "eduadapt-a-question-answer-benchmark-dataset-for-evaluating-grade-level-adaptability-in-llms",
      "title": "EduAdapt: A Question Answer Benchmark Dataset for Evaluating Grade-Level Adaptability in LLMs",
      "authors": "Numaan Naeem, Abdellah El Mekki, Muhammad Abdul-Mageed",
      "venue": "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: EMNLP 2025",
      "date": "2025-11-01",
      "year": 2025,
      "featured": false,
      "award": "",
      "url": "https://elmekki.me/articles/#eduadapt-a-question-answer-benchmark-dataset-for-evaluating-grade-level-adaptability-in-llms",
      "paper_url": "https://aclanthology.org/2025.emnlp-main.1736.pdf",
      "abstract": "Large language models (LLMs) are transforming education by answering questions, explaining complex concepts, and generating content across a wide range of subjects. Despite strong performance on academic benchmarks, they often fail to tailor responses to students’ grade levels. This is a critical need in K-12 education, where age-appropriate vocabulary and explanation are essential for effective learning. Existing models frequently produce outputs that are too advanced or vague for younger learners, and there are no standardized benchmarks to evaluate their ability to adjust across cognitive and developmental stages. To address this gap, we introduce EduAdapt, a benchmark of nearly 48k grade-labeled QA pairs across nine science subjects, spanning Grades 1-12 and grouped into four grade levels. We evaluate a diverse set of open-source LLMs on EduAdapt and find that while larger models generally perform better, they still struggle with generating suitable responses for early-grade students (Grades 1-5). Our work presents the first dataset and evaluation framework for assessing grade-level adaptability in LLMs, aiming to foster more developmentally aligned educational AI systems through better training and prompting strategies.",
      "bibtex": "@inproceedings{naeem-etal-2025-eduadapt,\ntitle = \"{E}du{A}dapt: A Question Answer Benchmark Dataset for Evaluating Grade-Level Adaptability in {LLM}s\",\nauthor = \"Naeem, Numaan  and\n  El Mekki, Abdellah  and\n  Abdul-Mageed, Muhammad\",\neditor = \"Christodoulopoulos, Christos  and\n  Chakraborty, Tanmoy  and\n  Rose, Carolyn  and\n  Peng, Violet\",\nbooktitle = \"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing\",\nmonth = nov,\nyear = \"2025\",\naddress = \"Suzhou, China\",\npublisher = \"Association for Computational Linguistics\",\nurl = \"https://aclanthology.org/2025.emnlp-main.1736/\",\npages = \"34224--34251\",\nISBN = \"979-8-89176-332-6\",\nabstract = \"Large language models (LLMs) are transforming education by answering questions, explaining complex concepts, and generating content across a wide range of subjects. Despite strong performance on academic benchmarks, they often fail to tailor responses to students' grade levels. This is a critical need in K-12 education, where age-appropriate vocabulary and explanation are essential for effective learning. Existing models frequently produce outputs that are too advanced or vague for younger learners, and there are no standardized benchmarks to evaluate their ability to adjust across cognitive and developmental stages. To address this gap, we introduce EduAdapt, a benchmark of nearly 48k grade-labeled QA pairs across nine science subjects, spanning Grades 1-12 and grouped into four grade levels. We evaluate a diverse set of open-source LLMs on EduAdapt and find that while larger models generally perform better, they still struggle with generating suitable responses for early-grade students (Grades 1-5). Our work presents the first dataset and evaluation framework for assessing grade-level adaptability in LLMs, aiming to foster more developmentally aligned educational AI systems through better training and prompting strategies. 
EduAdapt code and datasets are publicly available at https://github.com/NaumanNaeem/EduAdapt.\"\n}\n"
    },


    {
      "id": "palmx-2025-the-first-shared-task-on-benchmarking-llms-on-arabic-and-islamic-culture",
      "title": "PalmX 2025: The First Shared Task on Benchmarking LLMs on Arabic and Islamic Culture",
      "authors": "Fakhraddin Alwajih, Abdellah El Mekki, Hamdy Mubarak, Majd Hawasly, Abubakr Mohamed, Muhammad Abdul-Mageed",
      "venue": "Proceedings of The Third Arabic Natural Language Processing Conference: ArabicNLP 2025",
      "date": "2025-11-01",
      "year": 2025,
      "featured": false,
      "award": "",
      "url": "https://elmekki.me/articles/#palmx-2025-the-first-shared-task-on-benchmarking-llms-on-arabic-and-islamic-culture",
      "paper_url": "https://aclanthology.org/2025.arabicnlp-sharedtasks.107.pdf",
      "abstract": "Large Language Models (LLMs) inherently reflect the vast data distributions they encounter during their pre-training phase. As this data is predominantly sourced from the web, there is a high chance it will be skewed towards high-resourced languages and cultures, such as those of the West. Consequently, LLMs often exhibit a diminished understanding of certain communities, a gap that is particularly evident in their knowledge of Arabic and Islamic cultures. This issue becomes even more pronounced with increasingly under-represented topics. To address this critical challenge, we introduce PalmX 2025, the first shared task designed to benchmark the cultural competence of LLMs in these specific domains. The task is composed of two subtasks featuring multiple-choice questions (MCQs) in Modern Standard Arabic (MSA): General Arabic Culture and General Islamic Culture. These subtasks cover a wide range of topics, including traditions, food, history, religious practices, and language expressions from across 22 Arab countries. The initiative drew considerable interest, with 26 teams registering for Subtask 1 and 19 for Subtask 2, culminating in nine and six valid submissions, respectively. Our findings reveal that task-specific fine-tuning substantially boosts performance over baseline models. The top-performing systems achieved an accuracy of 72.15% on cultural questions and 84.22% on Islamic knowledge. Parameter-efficient fine-tuning emerged as the predominant and most effective approach among participants, while the utility of data augmentation was found to be domain-dependent. Ultimately, this benchmark provides a crucial, standardized framework to guide the development of more culturally grounded and competent Arabic LLMs. Results of the shared task demonstrate that general cultural and general religious knowledge remain challenging to LLMs, motivating us to continue to offer the shared task in the future.",
      "bibtex": "@inproceedings{alwajih-etal-2025-palmx,\ntitle = \"{P}alm{X} 2025: The First Shared Task on Benchmarking {LLM}s on {A}rabic and Islamic Culture\",\nauthor = \"Alwajih, Fakhraddin  and\n  El Mekki, Abdellah  and\n  Mubarak, Hamdy  and\n  Hawasly, Majd  and\n  Mohamed, Abubakr  and\n  Abdul-Mageed, Muhammad\",\neditor = \"Darwish, Kareem  and\n  Ali, Ahmed  and\n  Abu Farha, Ibrahim  and\n  Touileb, Samia  and\n  Zitouni, Imed  and\n  Abdelali, Ahmed  and\n  Al-Ghamdi, Sharefah  and\n  Alkhereyf, Sakhar  and\n  Zaghouani, Wajdi  and\n  Khalifa, Salam  and\n  AlKhamissi, Badr  and\n  Almatham, Rawan  and\n  Hamed, Injy  and\n  Alyafeai, Zaid  and\n  Alowisheq, Areeb  and\n  Inoue, Go  and\n  Mrini, Khalil  and\n  Alshammari, Waad\",\nbooktitle = \"Proceedings of The Third Arabic Natural Language Processing Conference: Shared Tasks\",\nmonth = nov,\nyear = \"2025\",\naddress = \"Suzhou, China\",\npublisher = \"Association for Computational Linguistics\",\nurl = \"https://aclanthology.org/2025.arabicnlp-sharedtasks.107/\",\npages = \"774--789\",\nISBN = \"979-8-89176-356-2\",\nabstract = \"Large Language Models (LLMs) inherently reflect the vast data distributions they encounter during their pre-training phase. As this data is predominantly sourced from the web, there is a high chance it will be skewed towards high-resourced languages and cultures, such as those of the West. Consequently, LLMs often exhibit a diminished understanding of certain communities, a gap that is particularly evident in their knowledge of Arabic and Islamic cultures. This issue becomes even more pronounced with increasingly under-represented topics. To address this critical challenge, we introduce PalmX 2025, the first shared task designed to benchmark the cultural competence of LLMs in these specific domains. The task is composed of two subtasks featuring multiple-choice questions (MCQs) in Modern Standard Arabic (MSA): General Arabic Culture and General Islamic Culture. 
These subtasks cover a wide range of topics, including traditions, food, history, religious practices, and language expressions from across 22 Arab countries. The initiative drew considerable interest, with 26 teams registering for Subtask 1 and 19 for Subtask 2, culminating in nine and six valid submissions, respectively. Our findings reveal that task-specific fine-tuning substantially boosts performance over baseline models. The top-performing systems achieved an accuracy of 72.15{\\%} on cultural questions and 84.22{\\%} on Islamic knowledge. Parameter-efficient fine-tuning emerged as the predominant and most effective approach among participants, while the utility of data augmentation was found to be domain-dependent. Ultimately, this benchmark provides a crucial, standardized framework to guide the development of more culturally grounded and competent Arabic LLMs. Results of the shared task demonstrate that general cultural and general religious knowledge remain challenging to LLMs, motivating us to continue to offer the shared task in the future.\"\n}\n"
    },


    {
      "id": "nilechat-towards-linguistically-diverse-and-culturally-aware-llms-for-local-communities",
      "title": "NileChat: Towards Linguistically Diverse and Culturally Aware LLMs for Local Communities",
      "authors": "Abdellah El Mekki, Houdaifa Atou, Omer Nacar, Shady Shehata, Muhammad Abdul-Mageed",
      "venue": "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: EMNLP 2025",
      "date": "2025-11-01",
      "year": 2025,
      "featured": true,
      "award": "",
      "url": "https://elmekki.me/articles/#nilechat-towards-linguistically-diverse-and-culturally-aware-llms-for-local-communities",
      "paper_url": "https://aclanthology.org/2025.emnlp-main.556.pdf",
      "abstract": "Enhancing the linguistic capabilities of Large Language Models (LLMs) to include low-resource languages is a critical research area. Current research directions predominantly rely on synthetic data generated by translating English corpora, which, while demonstrating promising linguistic understanding and translation abilities, often results in models aligned with source language culture. These models frequently fail to represent the cultural heritage and values of local communities. This work proposes a methodology to create both synthetic and retrieval-based pre-training data tailored to a specific community, considering its (i) language, (ii) cultural heritage, and (iii) cultural values. We demonstrate our methodology using Egyptian and Moroccan dialects as testbeds, chosen for their linguistic and cultural richness and current underrepresentation in LLMs. As a proof-of-concept, we develop NileChat, a 3B parameter Egyptian and Moroccan Arabic LLM adapted for Egyptian and Moroccan communities, incorporating their language, cultural heritage, and values. Our results on various understanding, translation, and cultural and values alignment benchmarks show that NileChat outperforms existing Arabic-aware LLMs of similar size and performs on par with larger models. This work addresses Arabic dialect in LLMs with a focus on cultural and values alignment via controlled synthetic data generation and retrieval-augmented pre-training for Moroccan Darija and Egyptian Arabic, including Arabizi variants, advancing Arabic NLP for low-resource communities.We share our methods, data, and models with the community to promote the inclusion and coverage of more diverse communities in cultural LLM development: https://github.com/UBC-NLP/nilechat.",
      "bibtex": "@inproceedings{el-mekki-etal-2025-nilechat,\ntitle = \"{N}ile{C}hat: Towards Linguistically Diverse and Culturally Aware {LLM}s for Local Communities\",\nauthor = \"El Mekki, Abdellah  and\n  Atou, Houdaifa  and\n  Nacar, Omer  and\n  Shehata, Shady  and\n  Abdul-Mageed, Muhammad\",\neditor = \"Christodoulopoulos, Christos  and\n  Chakraborty, Tanmoy  and\n  Rose, Carolyn  and\n  Peng, Violet\",\nbooktitle = \"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing\",\nmonth = nov,\nyear = \"2025\",\naddress = \"Suzhou, China\",\npublisher = \"Association for Computational Linguistics\",\nurl = \"https://aclanthology.org/2025.emnlp-main.556/\",\npages = \"10978--11002\",\nISBN = \"979-8-89176-332-6\",\nabstract = \"Enhancing the linguistic capabilities of Large Language Models (LLMs) to include low-resource languages is a critical research area. Current research directions predominantly rely on synthetic data generated by translating English corpora, which, while demonstrating promising linguistic understanding and translation abilities, often results in models aligned with source language culture. These models frequently fail to represent the cultural heritage and values of local communities. This work proposes a methodology to create both synthetic and retrieval-based pre-training data tailored to a specific community, considering its \\textit{(i) language}, \\textit{(ii) cultural heritage}, and \\textit{(iii) cultural values}. We demonstrate our methodology using Egyptian and Moroccan dialects as testbeds, chosen for their linguistic and cultural richness and current underrepresentation in LLMs. As a proof-of-concept, we develop \\textit{NileChat}, a 3B parameter Egyptian and Moroccan Arabic LLM adapted for Egyptian and Moroccan communities, incorporating their language, cultural heritage, and values. 
Our results on various understanding, translation, and cultural and values alignment benchmarks show that \\textit{NileChat} outperforms existing Arabic-aware LLMs of similar size and performs on par with larger models. This work addresses Arabic dialect in LLMs with a focus on cultural and values alignment via controlled synthetic data generation and retrieval-augmented pre-training for Moroccan Darija and Egyptian Arabic, including Arabizi variants, advancing Arabic NLP for low-resource communities.We share our methods, data, and models with the community to promote the inclusion and coverage of more diverse communities in cultural LLM development: https://github.com/UBC-NLP/nilechat.\"\n}\n"
    },


    {
      "id": "palm-a-culturally-inclusive-and-linguistically-diverse-dataset-for-arabic-llms",
      "title": "Palm: A Culturally Inclusive and Linguistically Diverse Dataset for Arabic LLMs",
      "authors": "Fakhraddin Alwajih, Abdellah El Mekki, Samar Mohamed Magdy, AbdelRahim A. Elmadany, Omer Nacar, El Moatez Billah Nagoudi, Reem Abdel-Salam, Hanin Atwany, Youssef Nafea, Abdulfattah Mohammed Yahya, Rahaf Alhamouri, Hamzah A. Alsayadi, Hiba Zayed, Sara Shatnawi, Serry Sibaee, Yasir Ech-chammakhy, Walid Al-Dhabyani, Marwa Mohamed Ali, Imen Jarraya, Ahmed Oumar El-Shangiti, Aisha Alraeesi, Mohammed Anwar AL-Ghrawi, Abdulrahman S. Al-Batati, Elgizouli Mohamed, Noha Taha Elgindi, Muhammed Saeed, Houdaifa Atou, Issam Ait Yahia, Abdelhak Bouayad, Mohammed Machrouh, Amal Makouar, Dania Alkawi, Mukhtar Mohamed, Safaa Taher Abdelfadil, Amine Ziad Ounnoughene, Anfel Rouabhia, Rwaa Assi, Ahmed Sorkatti, Mohamedou Cheikh Tourad, Anis Koubaa, Ismail Berrada, Mustafa Jarrar, Shady Shehata, Muhammad Abdul-Mageed",
      "venue": "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers): ACL 2025",
      "date": "2025-07-01",
      "year": 2025,
      "featured": false,
      "award": "Best Resource Paper Award",
      "url": "https://elmekki.me/articles/#palm-a-culturally-inclusive-and-linguistically-diverse-dataset-for-arabic-llms",
      "paper_url": "https://aclanthology.org/2025.acl-long.1579.pdf",
      "abstract": "As large language models (LLMs) become increasingly integrated into daily life, ensuring their cultural sensitivity and inclusivity is paramount. We introduce PALM, a year-long community-driven project covering all 22 Arab countries. The dataset contains instruction–response pairs in both Modern Standard Arabic (MSA) and dialectal Arabic (DA), spanning 20 diverse topics. Built by a team of 44 researchers across the Arab world—each an author of this paper—PALM offers a broad, inclusive perspective. We use PALM to evaluate the cultural and dialectal capabilities of several frontier LLMs, revealing notable limitations: while closed-source LLMs generally perform strongly, they still exhibit flaws, and smaller open-source models face greater challenges. Furthermore, certain countries (e.g., Egypt, the UAE) appear better represented than others (e.g., Iraq, Mauritania, Yemen). Our annotation guidelines, code, and data are publicly available for reproducibility. More information about PALM is available on our project page: https://github.com/UBC-NLP/palm.",
      "bibtex": "@inproceedings{alwajih-etal-2025-palm,\n    title = \"Palm: A Culturally Inclusive and Linguistically Diverse Dataset for {A}rabic {LLM}s\",\n    author = \"Alwajih, Fakhraddin  and\n      El Mekki, Abdellah  and\n      Magdy, Samar Mohamed  and\n      Elmadany, AbdelRahim A.  and\n      Nacar, Omer  and\n      Nagoudi, El Moatez Billah  and\n      Abdel-Salam, Reem  and\n      Atwany, Hanin  and\n      Nafea, Youssef  and\n      Yahya, Abdulfattah Mohammed  and\n      Alhamouri, Rahaf  and\n      Alsayadi, Hamzah A.  and\n      Zayed, Hiba  and\n      Shatnawi, Sara  and\n      Sibaee, Serry  and\n      Ech-chammakhy, Yasir  and\n      Al-Dhabyani, Walid  and\n      Ali, Marwa Mohamed  and\n      Jarraya, Imen  and\n      El-Shangiti, Ahmed Oumar  and\n      Alraeesi, Aisha  and\n      AL-Ghrawi, Mohammed Anwar  and\n      Al-Batati, Abdulrahman S.  and\n      Mohamed, Elgizouli  and\n      Elgindi, Noha Taha  and\n      Saeed, Muhammed  and\n      Atou, Houdaifa  and\n      Yahia, Issam Ait  and\n      Bouayad, Abdelhak  and\n      Machrouh, Mohammed  and\n      Makouar, Amal  and\n      Alkawi, Dania  and\n      Mohamed, Mukhtar  and\n      Abdelfadil, Safaa Taher  and\n      Ounnoughene, Amine Ziad  and\n      Rouabhia, Anfel  and\n      Assi, Rwaa  and\n      Sorkatti, Ahmed  and\n      Tourad, Mohamedou Cheikh  and\n      Koubaa, Anis  and\n      Berrada, Ismail  and\n      Jarrar, Mustafa  and\n      Shehata, Shady  and\n      Abdul-Mageed, Muhammad\",\n    editor = \"Che, Wanxiang  and\n      Nabende, Joyce  and\n      Shutova, Ekaterina  and\n      Pilehvar, Mohammad Taher\",\n    booktitle = \"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)\",\n    month = jul,\n    year = \"2025\",\n    address = \"Vienna, Austria\",\n    publisher = \"Association for Computational Linguistics\",\n    url = \"https://aclanthology.org/2025.acl-long.1579/\",\n    doi = 
\"10.18653/v1/2025.acl-long.1579\",\n    pages = \"32871--32894\",\n    ISBN = \"979-8-89176-251-0\",\n    abstract = \"As large language models (LLMs) become increasingly integrated into daily life, ensuring their cultural sensitivity and inclusivity is paramount. We introduce PALM, a year-long community-driven project covering all 22 Arab countries. The dataset contains instruction{--}response pairs in both Modern Standard Arabic (MSA) and dialectal Arabic (DA), spanning 20 diverse topics. Built by a team of 44 researchers across the Arab world{---}each an author of this paper{---}PALM offers a broad, inclusive perspective. We use PALM to evaluate the cultural and dialectal capabilities of several frontier LLMs, revealing notable limitations: while closed-source LLMs generally perform strongly, they still exhibit flaws, and smaller open-source models face greater challenges. Furthermore, certain countries (e.g., Egypt, the UAE) appear better represented than others (e.g., Iraq, Mauritania, Yemen). Our annotation guidelines, code, and data are publicly available for reproducibility. More information about PALM is available on our project page: https://github.com/UBC-NLP/palm.\"\n}\n"
    },


    {
      "id": "effective-self-mining-of-in-context-examples-for-unsupervised-machine-translation-with-llms",
      "title": "Effective Self-Mining of In-Context Examples for Unsupervised Machine Translation with LLMs",
      "authors": "Abdellah El Mekki, Muhammad Abdul-Mageed",
      "venue": "Findings of the Association for Computational Linguistics: NAACL 2025",
      "date": "2025-05-01",
      "year": 2025,
      "featured": false,
      "award": "",
      "url": "https://elmekki.me/articles/#effective-self-mining-of-in-context-examples-for-unsupervised-machine-translation-with-llms",
      "paper_url": "https://aclanthology.org/2025.findings-naacl.238.pdf",
      "abstract": "Large Language Models (LLMs) have demonstrated impressive performance on a wide range of natural language processing (NLP) tasks, primarily through in-context learning (ICL). In ICL, the LLM is provided with examples that represent a given task such that it learns to generate answers for test inputs. However, access to these in-context examples is not guaranteed especially for low-resource or massively multilingual tasks. In this work, we propose an unsupervised approach to mine in-context examples for machine translation (MT), enabling unsupervised MT (UMT) across different languages. Our approach begins with word-level mining to acquire word translations that are then used to perform sentence-level mining. As the quality of mined parallel pairs may not be optimal due to noise or mistakes, we introduce a filtering criterion to select the optimal in-context examples from a pool of unsupervised parallel sentences. We evaluate our approach using two multilingual LLMs on 288 directions from the FLORES-200 dataset (CITATION) and analyze the impact of various linguistic features on performance. Our findings demonstrate the effectiveness of our unsupervised approach in mining in-context examples for MT, leading to better or comparable translation performance as translation with regular in-context samples (extracted from human-annotated data), while also outperforming the other state-of-the-art UMT methods by an average of 7 BLEU points.",
      "bibtex": "@inproceedings{el-mekki-abdul-mageed-2025-effective,\n    title = \"Effective Self-Mining of In-Context Examples for Unsupervised Machine Translation with {LLM}s\",\n    author = \"El Mekki, Abdellah  and\n      Abdul-Mageed, Muhammad\",\n    editor = \"Chiruzzo, Luis  and\n      Ritter, Alan  and\n      Wang, Lu\",\n    booktitle = \"Findings of the Association for Computational Linguistics: NAACL 2025\",\n    month = apr,\n    year = \"2025\",\n    address = \"Albuquerque, New Mexico\",\n    publisher = \"Association for Computational Linguistics\",\n    url = \"https://aclanthology.org/2025.findings-naacl.238/\",\n    doi = \"10.18653/v1/2025.findings-naacl.238\",\n    pages = \"4229--4256\",\n    ISBN = \"979-8-89176-195-7\",\n    abstract = \"Large Language Models (LLMs) have demonstrated impressive performance on a wide range of natural language processing (NLP) tasks, primarily through in-context learning (ICL). In ICL, the LLM is provided with examples that represent a given task such that it learns to generate answers for test inputs. However, access to these in-context examples is not guaranteed especially for low-resource or massively multilingual tasks. In this work, we propose an unsupervised approach to mine in-context examples for machine translation (MT), enabling unsupervised MT (UMT) across different languages. Our approach begins with word-level mining to acquire word translations that are then used to perform sentence-level mining. As the quality of mined parallel pairs may not be optimal due to noise or mistakes, we introduce a filtering criterion to select the optimal in-context examples from a pool of unsupervised parallel sentences. We evaluate our approach using two multilingual LLMs on 288 directions from the FLORES-200 dataset (CITATION) and analyze the impact of various linguistic features on performance. 
Our findings demonstrate the effectiveness of our unsupervised approach in mining in-context examples for MT, leading to better or comparable translation performance as translation with regular in-context samples (extracted from human-annotated data), while also outperforming the other state-of-the-art UMT methods by an average of 7 BLEU points.\"\n}\n"
    },


    {
      "id": "swan-and-arabicmteb-dialect-aware-arabic-centric-cross-lingual-and-cross-cultural-embedding-models-and-benchmarks",
      "title": "Swan and ArabicMTEB: Dialect-Aware, Arabic-Centric, Cross-Lingual, and Cross-Cultural Embedding Models and Benchmarks",
      "authors": "Gagan Bhatia, El Moatez Billah Nagoudi, Abdellah El Mekki, Fakhraddin Alwajih, Muhammad Abdul-Mageed",
      "venue": "Findings of the Association for Computational Linguistics: NAACL 2025",
      "date": "2025-05-01",
      "year": 2025,
      "featured": false,
      "award": "",
      "url": "https://elmekki.me/articles/#swan-and-arabicmteb-dialect-aware-arabic-centric-cross-lingual-and-cross-cultural-embedding-models-and-benchmarks",
      "paper_url": "https://aclanthology.org/2025.findings-naacl.263.pdf",
      "abstract": "In this paper, we introduce Swan, a family of embedding models centred around the Arabic language, addressing both small-scale and large-scale use cases. Swan includes two variants: Swan-Small, based on ARBERTv2, and Swan-Large, built on ArMistral, a pretrained Arabic large language model. To evaluate these models, we propose ArabicMTEB, a comprehensive benchmark suite that assesses cross-lingual, multi-dialectal, multi-domain, and multi-cultural Arabic text embedding performance, covering eight diverse tasks and spanning 94 datasets. Swan-Large achieves state-of-the-art results, outperforming Multilingual-E5-large in most Arabic tasks, while the Swan-Small consistently surpasses Multilingual-E5-base. Our extensive evaluations demonstrate that Swan models are dialectally and culturally aware, excelling across various Arabic domains while offering significant monetary efficiency. This work significantly advances the field of Arabic language modelling and provides valuable resources for future research and applications in Arabic natural language processing. Our models and benchmarks will be made publicly accessible for research.",
      "bibtex": "@inproceedings{bhatia-etal-2025-swan,\n    title = \"Swan and {A}rabic{MTEB}: Dialect-Aware, {A}rabic-Centric, Cross-Lingual, and Cross-Cultural Embedding Models and Benchmarks\",\n    author = \"Bhatia, Gagan  and\n      Nagoudi, El Moatez Billah  and\n      El Mekki, Abdellah  and\n      Alwajih, Fakhraddin  and\n      Abdul-Mageed, Muhammad\",\n    editor = \"Chiruzzo, Luis  and\n      Ritter, Alan  and\n      Wang, Lu\",\n    booktitle = \"Findings of the Association for Computational Linguistics: NAACL 2025\",\n    month = apr,\n    year = \"2025\",\n    address = \"Albuquerque, New Mexico\",\n    publisher = \"Association for Computational Linguistics\",\n    url = \"https://aclanthology.org/2025.findings-naacl.263/\",\n    doi = \"10.18653/v1/2025.findings-naacl.263\",\n    pages = \"4654--4670\",\n    ISBN = \"979-8-89176-195-7\",\n    abstract = \"In this paper, we introduce Swan, a family of embedding models centred around the Arabic language, addressing both small-scale and large-scale use cases. Swan includes two variants: Swan-Small, based on ARBERTv2, and Swan-Large, built on ArMistral, a pretrained Arabic large language model. To evaluate these models, we propose ArabicMTEB, a comprehensive benchmark suite that assesses cross-lingual, multi-dialectal, multi-domain, and multi-cultural Arabic text embedding performance, covering eight diverse tasks and spanning 94 datasets. Swan-Large achieves state-of-the-art results, outperforming Multilingual-E5-large in most Arabic tasks, while the Swan-Small consistently surpasses Multilingual-E5-base. Our extensive evaluations demonstrate that Swan models are dialectally and culturally aware, excelling across various Arabic domains while offering significant monetary efficiency. This work significantly advances the field of Arabic language modelling and provides valuable resources for future research and applications in Arabic natural language processing. 
Our models and benchmarks will be made publicly accessible for research.\"\n}\n"
    },


    {
      "id": "casablanca-data-and-models-for-multidialectal-arabic-speech-recognition",
      "title": "Casablanca: Data and Models for Multidialectal Arabic Speech Recognition",
      "authors": "Bashar Talafha, Karima Kadaoui, Samar Mohamed Magdy, Mariem Habiboullah, Chafei Mohamed Chafei, Ahmed Oumar El-Shangiti, Hiba Zayed, Mohamedou Cheikh Tourad, Rahaf Alhamouri, Rwaa Assi, Aisha Alraeesi, Hour Mohamed, Fakhraddin Alwajih, Abdelrahman Mohamed, Abdellah El Mekki, El Moatez Billah Nagoudi, Benelhadj Djelloul Mama Saadia, Hamzah A. Alsayadi, Walid Al-Dhabyani, Sara Shatnawi, Yasir Ech-chammakhy, Amal Makouar, Yousra Berrachedi, Mustafa Jarrar, Shady Shehata, Ismail Berrada, Muhammad Abdul-Mageed",
      "venue": "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
      "date": "2024-11-04",
      "year": 2024,
      "featured": false,
      "award": "",
      "url": "https://elmekki.me/articles/#casablanca-data-and-models-for-multidialectal-arabic-speech-recognition",
      "paper_url": "https://aclanthology.org/2024.emnlp-main.1211.pdf",
      "abstract": "In spite of the recent progress in speech processing, the majority of world languages and dialects remain uncovered. This situation only furthers an already wide technological divide, thereby hindering technological and socioeconomic inclusion. This challenge is largely due to the absence of datasets that can empower diverse speech systems. In this paper, we seek to mitigate this obstacle for a number of Arabic dialects by presenting Casablanca, a large-scale community-driven effort to collect and transcribe a multi-dialectal Arabic dataset. The dataset covers eight dialects: Algerian, Egyptian, Emirati, Jordanian, Mauritanian, Moroccan, Palestinian, and Yemeni, and includes annotations for transcription, gender, dialect, and code-switching. We also develop a number of strong baselines exploiting Casablanca. The project page for Casablanca is accessible at: www.dlnlp.ai/speech/casablanca.",
      "bibtex": "@inproceedings{talafha-etal-2024-casablanca,\n    title = \"{C}asablanca: Data and Models for Multidialectal {A}rabic Speech Recognition\",\n    author = \"Talafha, Bashar  and\n      Kadaoui, Karima  and\n      Magdy, Samar Mohamed  and\n      Habiboullah, Mariem  and\n      Chafei, Chafei Mohamed  and\n      El-Shangiti, Ahmed Oumar  and\n      Zayed, Hiba  and\n      Tourad, Mohamedou Cheikh  and\n      Alhamouri, Rahaf  and\n      Assi, Rwaa  and\n      Alraeesi, Aisha  and\n      Mohamed, Hour  and\n      Alwajih, Fakhraddin  and\n      Mohamed, Abdelrahman  and\n      El Mekki, Abdellah  and\n      Nagoudi, El Moatez Billah  and\n      Saadia, Benelhadj Djelloul Mama  and\n      Alsayadi, Hamzah A.  and\n      Al-Dhabyani, Walid  and\n      Shatnawi, Sara  and\n      Ech-chammakhy, Yasir  and\n      Makouar, Amal  and\n      Berrachedi, Yousra  and\n      Jarrar, Mustafa  and\n      Shehata, Shady  and\n      Berrada, Ismail  and\n      Abdul-Mageed, Muhammad\",\n    editor = \"Al-Onaizan, Yaser  and\n      Bansal, Mohit  and\n      Chen, Yun-Nung\",\n    booktitle = \"Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing\",\n    month = nov,\n    year = \"2024\",\n    address = \"Miami, Florida, USA\",\n    publisher = \"Association for Computational Linguistics\",\n    url = \"https://aclanthology.org/2024.emnlp-main.1211/\",\n    doi = \"10.18653/v1/2024.emnlp-main.1211\",\n    pages = \"21745--21758\"\n}\n"
    },


    {
      "id": "promap-effective-bilingual-lexicon-induction-via-language-model-prompting",
      "title": "ProMap: Effective Bilingual Lexicon Induction via Language Model Prompting",
      "authors": "Abdellah El Mekki, Muhammad Abdul-Mageed, ElMoatez Billah Nagoudi, Ismail Berrada, Ahmed Khoumsi",
      "venue": "Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
      "date": "2023-11-04",
      "year": 2023,
      "featured": true,
      "award": "Outstanding Paper Award",
      "url": "https://elmekki.me/articles/#promap-effective-bilingual-lexicon-induction-via-language-model-prompting",
      "paper_url": "https://arxiv.org/abs/2310.18778",
      "abstract": "Bilingual Lexicon Induction (BLI), where words are translated between two languages, is an important NLP task. While noticeable progress on BLI in rich resource languages using static word embeddings has been achieved. The word translation performance can be further improved by incorporating information from contextualized word embeddings. In this paper, we introduce ProMap, a novel approach for BLI that leverages the power of prompting pretrained multilingual and multidialectal language models to address these challenges. To overcome the employment of subword tokens in these models, ProMap relies on an effective padded prompting of language models with a seed dictionary that achieves good performance when used independently. We also demonstrate the effectiveness of ProMap in re-ranking results from other BLI methods such as with aligned static word embeddings. When evaluated on both rich-resource and low-resource languages, ProMap consistently achieves stateof-the-art results. Furthermore, ProMap enables strong performance in few-shot scenarios (even with less than 10 training examples), making it a valuable tool for low-resource language translation. Overall, we believe our method offers both exciting and promising direction for BLI in general and low-resource languages in particular.",
      "bibtex": "@inproceedings{el-mekki-etal-2023-promap,\n    title = \"{P}ro{M}ap: Effective Bilingual Lexicon Induction via Language Model Prompting\",\n    author = \"El Mekki, Abdellah  and\n      Abdul-Mageed, Muhammad  and\n      Nagoudi, ElMoatez Billah  and\n      Berrada, Ismail  and\n      Khoumsi, Ahmed\",\n    editor = \"Park, Jong C.  and\n      Arase, Yuki  and\n      Hu, Baotian  and\n      Lu, Wei  and\n      Wijaya, Derry  and\n      Purwarianti, Ayu  and\n      Krisnadhi, Adila Alfa\",\n    booktitle = \"Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)\",\n    month = nov,\n    year = \"2023\",\n    address = \"Nusa Dua, Bali\",\n    publisher = \"Association for Computational Linguistics\",\n    url = \"https://aclanthology.org/2023.ijcnlp-main.39/\",\n    doi = \"10.18653/v1/2023.ijcnlp-main.39\",\n    pages = \"577--597\"\n}\n"
    },


    {
      "id": "fed-anids-federated-learning-for-anomaly-based-network-intrusion-detection-systems",
      "title": "Fed-ANIDS: Federated learning for anomaly-based network intrusion detection systems",
      "authors": "Meryem Janati Idrissi, Hamza Alami, Abdelkader El Mahdaouy, Abdellah El Mekki, Soufiane Oualil, Zakaria Yartaoui, Ismail Berrada",
      "venue": "Expert Systems with Applications",
      "date": "2023-08-30",
      "year": 2023,
      "featured": false,
      "award": "",
      "url": "https://elmekki.me/articles/#fed-anids-federated-learning-for-anomaly-based-network-intrusion-detection-systems",
      "paper_url": "https://www.sciencedirect.com/science/article/pii/S0957417423015026",
      "abstract": "As computer networks and interconnected systems continue to gain widespread adoption, ensuring cybersecurity has become a prominent concern for organizations, regardless of their scale or size. Meanwhile, centralized machine learning-based Anomaly Detection (AD) methods have shown promising results in improving the accuracy and efficiency of Network Intrusion Detection Systems (NIDS). However, new challenges arise such as privacy concerns and regulatory restrictions that must be tackled. Federated Learning (FL) has emerged as a solution that allows distributed clients to collaboratively train a shared model while preserving the privacy of their local data. In this paper, we propose Fed-ANIDS, a NIDS that leverages AD and FL to address the privacy concerns associated with centralized models. To detect intrusions, we compute an intrusion score based on the reconstruction error of normal traffic using various AD models, including simple autoencoders, variational autoencoders, and adversarial autoencoders. We thoroughly evaluate Fed-ANIDS using various settings and popular datasets, including USTC-TFC2016, CIC-IDS2017, and CSE-CIC-IDS2018. The proposed method demonstrates its effectiveness by achieving high performance in terms of different metrics while preserving the data privacy of distributed clients. Our findings highlight that autoencoder-based models outperform other generative adversarial network-based models, achieving high detection accuracy coupled with fewer false alarms. In addition, the FL framework (FedProx), which is a generalization and re-parametrization of the standard method for FL (FedAvg), achieves better results.",
      "bibtex": "@article{IDRISSI2023121000,\ntitle = {Fed-ANIDS: Federated learning for anomaly-based network intrusion detection systems},\njournal = {Expert Systems with Applications},\nvolume = {234},\npages = {121000},\nyear = {2023},\nissn = {0957-4174},\ndoi = {https://doi.org/10.1016/j.eswa.2023.121000},\nurl = {https://www.sciencedirect.com/science/article/pii/S0957417423015026},\nauthor = {Meryem Janati Idrissi and Hamza Alami and Abdelkader {El Mahdaouy} and Abdellah {El Mekki} and Soufiane Oualil and Zakaria Yartaoui and Ismail Berrada},\nkeywords = {Network security and privacy, Federated learning, Network intrusion detection, Anomaly detection, Autoencoders}}\n"
    },


    {
      "id": "omcd-offensive-moroccan-comments-dataset",
      "title": "OMCD: Offensive Moroccan Comments Dataset",
      "authors": "Kabil Essefar, Hassan Ait Baha, Abdelkader El Mahdaouy, Abdellah El Mekki, Ismail Berrada",
      "venue": "Proceedings of the 16th International Workshop on Semantic Evaluation (SemEval-2022)",
      "date": "2023-06-05",
      "year": 2023,
      "featured": false,
      "award": "",
      "url": "https://elmekki.me/articles/#omcd-offensive-moroccan-comments-dataset",
      "paper_url": "https://link.springer.com/article/10.1007/s10579-023-09663-2",
      "abstract": "Offensive content, such as verbal attacks, demeaning comments, or hate speech, has become widespread on social media. Automatic detection of this content is considered an important and challenging task. Although several research works have been proposed to address this challenge for high-resource languages, research on detecting offensive content in Dialectal Arabic (DA) remains under-explored. Recently, the detection of offensive language in DA has gained increasing interest among researchers in Natural Language Processing (NLP). However, only a limited number of annotated datasets have been introduced for single or multiple coarse-grained dialects. In this paper, we introduce Offensive Moroccan Comments Dataset (OMCD), the first dataset for offensive language detection for the Moroccan dialect. First, we present the data collection steps, the statistical analysis, and the annotation guidelines of the introduced dataset. Then, we evaluate several state-of-the-art Machine Learning (ML) and Deep Learning (DL) based models on the OMCD dataset. Finally, we highlight the impact of emojis on the evaluated models for offensive language detection.",
      "bibtex": "@article{essefar2023omcd,\n  title={OMCD: Offensive Moroccan comments dataset},\n  author={Essefar, Kabil and Ait Baha, Hassan and El Mahdaouy, Abdelkader and El Mekki, Abdellah and Berrada, Ismail},\n  journal={Language Resources and Evaluation},\n  volume={57},\n  number={4},\n  pages={1745--1765},\n  year={2023},\n  publisher={Springer}\n}\n"
    },


    {
      "id": "cs-um6p-at-semeval-2022-task-6-transformer-based-models-for-intended-sarcasm-detection-in-english-and-arabic",
      "title": "CS-UM6P at SemEval-2022 Task 6: Transformer-based Models for Intended Sarcasm Detection in English and Arabic",
      "authors": "Abdelkader El Mahdaouy, Abdellah El Mekki, Kabil Essefar, Abderrahman Skiredj, Ismail Berrada",
      "venue": "Proceedings of the 16th International Workshop on Semantic Evaluation (SemEval-2022)",
      "date": "2022-07-14",
      "year": 2022,
      "featured": false,
      "award": "",
      "url": "https://elmekki.me/articles/#cs-um6p-at-semeval-2022-task-6-transformer-based-models-for-intended-sarcasm-detection-in-english-and-arabic",
      "paper_url": "https://aclanthology.org/2022.semeval-1.117.pdf",
      "abstract": "Sarcasm is a form of figurative language where the intended meaning of a sentence differs from its literal meaning. This poses a serious challenge to several Natural Language Processing (NLP) applications such as Sentiment Analysis, Opinion Mining, and Author Profiling. In this paper, we present our participating system to the intended sarcasm detection task in English and Arabic languages. Our system consists of three deep learning-based models leveraging two existing pre-trained language models for Arabic and English. We have participated in all sub-tasks. Our official submissions achieve the best performance on sub-task A for Arabic language and rank second in sub-task B. For sub-task C, our system is ranked 7th and 11th on Arabic and English datasets, respectively.",
      "bibtex": "@inproceedings{el-mahdaouy-etal-2022-cs,\n    title = \"{CS}-{UM}6{P} at {S}em{E}val-2022 Task 6: Transformer-based Models for Intended Sarcasm Detection in {E}nglish and {A}rabic\",\n    author = \"El Mahdaouy, Abdelkader  and\n      El Mekki, Abdellah  and\n      Essefar, Kabil  and\n      Skiredj, Abderrahman  and\n      Berrada, Ismail\",\n    editor = \"Emerson, Guy  and\n      Schluter, Natalie  and\n      Stanovsky, Gabriel  and\n      Kumar, Ritesh  and\n      Palmer, Alexis  and\n      Schneider, Nathan  and\n      Singh, Siddharth  and\n      Ratan, Shyam\",\n    booktitle = \"Proceedings of the 16th International Workshop on Semantic Evaluation (SemEval-2022)\",\n    month = jul,\n    year = \"2022\",\n    address = \"Seattle, United States\",\n    publisher = \"Association for Computational Linguistics\",\n    url = \"https://aclanthology.org/2022.semeval-1.117/\",\n    doi = \"10.18653/v1/2022.semeval-1.117\",\n    pages = \"844--850\",\n    abstract = \"Sarcasm is a form of figurative language where the intended meaning of a sentence differs from its literal meaning. This poses a serious challenge to several Natural Language Processing (NLP) applications such as Sentiment Analysis, Opinion Mining, and Author Profiling. In this paper, we present our participating system to the intended sarcasm detection task in English and Arabic languages. Our system consists of three deep learning-based models leveraging two existing pre-trained language models for Arabic and English. We have participated in all sub-tasks. Our official submissions achieve the best performance on sub-task A for Arabic language and rank second in sub-task B. For sub-task C, our system is ranked 7th and 11th on Arabic and English datasets, respectively.\"\n}\n"
    },


    {
      "id": "um6p-cs-at-semeval-2022-task-11-enhancing-multilingual-and-code-mixed-complex-named-entity-recognition-via-pseudo-labels-using-multilingual-transformer",
      "title": "UM6P-CS at SemEval-2022 Task 11: Enhancing Multilingual and Code-Mixed Complex Named Entity Recognition via Pseudo Labels using Multilingual Transformer",
      "authors": "Abdellah El Mekki, Abdelkader El Mahdaouy, Mohammed Akallouch, Ismail Berrada, Ahmed Khoumsi",
      "venue": "Proceedings of the 16th International Workshop on Semantic Evaluation (SemEval-2022)",
      "date": "2022-07-14",
      "year": 2022,
      "featured": false,
      "award": "",
      "url": "https://elmekki.me/articles/#um6p-cs-at-semeval-2022-task-11-enhancing-multilingual-and-code-mixed-complex-named-entity-recognition-via-pseudo-labels-using-multilingual-transformer",
      "paper_url": "https://aclanthology.org/2022.semeval-1.207.pdf",
      "abstract": "Building real-world complex Named Entity Recognition (NER) systems is a challenging task. This is due to the complexity and ambiguity of named entities that appear in various contexts such as short input sentences, emerging entities, and complex entities. Besides, real-world queries are mostly malformed, as they can be code-mixed or multilingual, among other scenarios. In this paper, we introduce our submitted system to the Multilingual Complex Named Entity Recognition (MultiCoNER) shared task. We approach the complex NER for multilingual and code-mixed queries, by relying on the contextualized representation provided by the multilingual Transformer XLM-RoBERTa. In addition to the CRF-based token classification layer, we incorporate a span classification loss to recognize named entities spans. Furthermore, we use a self-training mechanism to generate weakly-annotated data from a large unlabeled dataset. Our proposed system is ranked 6th and 8th in the multilingual and code-mixed MultiCoNER’s tracks respectively.",
      "bibtex": "@inproceedings{el-mekki-etal-2022-um6p,\n    title = \"{UM}6{P}-{CS} at {S}em{E}val-2022 Task 11: Enhancing Multilingual and Code-Mixed Complex Named Entity Recognition via Pseudo Labels using Multilingual Transformer\",\n    author = \"El Mekki, Abdellah  and\n      El Mahdaouy, Abdelkader  and\n      Akallouch, Mohammed  and\n      Berrada, Ismail  and\n      Khoumsi, Ahmed\",\n    editor = \"Emerson, Guy  and\n      Schluter, Natalie  and\n      Stanovsky, Gabriel  and\n      Kumar, Ritesh  and\n      Palmer, Alexis  and\n      Schneider, Nathan  and\n      Singh, Siddharth  and\n      Ratan, Shyam\",\n    booktitle = \"Proceedings of the 16th International Workshop on Semantic Evaluation (SemEval-2022)\",\n    month = jul,\n    year = \"2022\",\n    address = \"Seattle, United States\",\n    publisher = \"Association for Computational Linguistics\",\n    url = \"https://aclanthology.org/2022.semeval-1.207/\",\n    doi = \"10.18653/v1/2022.semeval-1.207\",\n    pages = \"1511--1517\",\n    abstract = \"Building real-world complex Named Entity Recognition (NER) systems is a challenging task. This is due to the complexity and ambiguity of named entities that appear in various contexts such as short input sentences, emerging entities, and complex entities. Besides, real-world queries are mostly malformed, as they can be code-mixed or multilingual, among other scenarios. In this paper, we introduce our submitted system to the Multilingual Complex Named Entity Recognition (MultiCoNER) shared task. We approach the complex NER for multilingual and code-mixed queries, by relying on the contextualized representation provided by the multilingual Transformer XLM-RoBERTa. In addition to the CRF-based token classification layer, we incorporate a span classification loss to recognize named entities spans. Furthermore, we use a self-training mechanism to generate weakly-annotated data from a large unlabeled dataset. 
Our proposed system is ranked 6th and 8th in the multilingual and code-mixed MultiCoNER{'}s tracks respectively.\"\n}\n"
    },


    {
      "id": "adasl-an-unsupervised-domain-adaptation-framework-for-arabic-multi-dialectal-sequence-labeling",
      "title": "AdaSL: An Unsupervised Domain Adaptation framework for Arabic multi-dialectal Sequence Labeling",
      "authors": "Abdellah El Mekki, Abdelkader El Mahdaouy, Ismail Berrada, Ahmed Khoumsi",
      "venue": "Information Processing & Management",
      "date": "2022-07-01",
      "year": 2022,
      "featured": false,
      "award": "",
      "url": "https://elmekki.me/articles/#adasl-an-unsupervised-domain-adaptation-framework-for-arabic-multi-dialectal-sequence-labeling",
      "paper_url": "https://www.sciencedirect.com/science/article/pii/S0306457322000814",
      "abstract": "Dialectal Arabic (DA) refers to varieties of everyday spoken languages in the Arab world. These dialects differ according to the country and region of the speaker, and their textual content is constantly growing with the rise of social media networks and web blogs. Although research on Natural Language Processing (NLP) on standard Arabic, namely Modern Standard Arabic (MSA), has witnessed remarkable progress, research efforts on DA are rather limited. This is due to numerous challenges, such as the scarcity of labeled data as well as the nature and structure of DA. While some recent works have reached decent results on several DA sentence classification tasks, other complex tasks, such as sequence labeling, still suffer from weak performances when it comes to DA varieties with either a limited amount of labeled data or unlabeled data only. Besides, it has been shown that zero-shot transfer learning from models trained on MSA does not perform well on DA. In this paper, we introduce AdaSL, a new unsupervised domain adaptation framework for Arabic multi-dialectal sequence labeling, leveraging unlabeled DA data, labeled MSA data, and existing multilingual and Arabic Pre-trained Language Models (PLMs). The proposed framework relies on four key components: (1) domain adaptive fine-tuning of multilingual/MSA language models on unlabeled DA data, (2) sub-word embedding pooling, (3) iterative self-training on unlabeled DA data, and (4) iterative DA and MSA distribution alignment. We evaluate our framework on multi-dialectal Named Entity Recognition (NER) and Part-of-Speech (POS) tagging tasks. The overall results show that the zero-shot transfer learning, using our proposed framework, boosts the performance of the multilingual PLMs by 40.87% in macro-F1 score for the NER task, while it boosts the accuracy by 6.95% for the POS tagging task. 
For the Arabic PLMs, our proposed framework increases performance by 16.18% macro-F1 for the NER task and 2.22% accuracy for the POS tagging task, and thus, achieving new state-of-the-art zero-shot transfer learning performance for Arabic multi-dialectal sequence labeling.",
      "bibtex": "@article{ELMEKKI2022102964,\ntitle = {AdaSL: An Unsupervised Domain Adaptation framework for Arabic multi-dialectal Sequence Labeling},\njournal = {Information Processing & Management},\nvolume = {59},\nnumber = {4},\npages = {102964},\nyear = {2022},\nissn = {0306-4573},\ndoi = {https://doi.org/10.1016/j.ipm.2022.102964},\nurl = {https://www.sciencedirect.com/science/article/pii/S0306457322000814},\nauthor = {Abdellah {El Mekki} and Abdelkader {El Mahdaouy} and Ismail Berrada and Ahmed Khoumsi},\nkeywords = {Dialectal Arabic, Arabic natural language processing, Domain adaptation, Multi-dialectal sequence labeling, Named entity recognition, Part-of-speech tagging, Zero-shot transfer learning}\n}\n"
    },


    {
      "id": "deep-multi-task-models-for-misogyny-identification-and-categorization-on-arabic-social-media",
      "title": "Deep Multi-Task Models for Misogyny Identification and Categorization on Arabic Social Media",
      "authors": "Abdellah El Mekki, Abdelkader El Mahdaouy, Mohammed Akallouch, Ismail Berrada, Ahmed Khoumsi",
      "venue": "Working Notes of FIRE 2021 - Forum for Information Retrieval Evaluation (FIRE-WN 2021), Gandhinagar, India",
      "date": "2021-12-13",
      "year": 2021,
      "featured": false,
      "award": "",
      "url": "https://elmekki.me/articles/#deep-multi-task-models-for-misogyny-identification-and-categorization-on-arabic-social-media",
      "paper_url": "https://ceur-ws.org/Vol-3159/T5-5.pdf",
      "abstract": "The prevalence of toxic content on social media platforms, such as hate speech, offensive language, and misogyny, presents serious challenges to our interconnected society. These challenging issues have attracted widespread attention in Natural Language Processing (NLP) community. In this paper, we present the submitted systems to the first Arabic Misogyny Identification shared task. We investigate three multi-task learning models as well as their single-task counterparts. In order to encode the input text, our models rely on the pre-trained MARBERT language model. The overall obtained results show that all our submitted models have achieved the best performances (top three ranked submissions) in both misogyny identification and categorization tasks.",
      "bibtex": "@inproceedings{mahdaouy-2021-deep,\n  author    = {Abdelkader El Mahdaouy and\n               Abdellah El Mekki and\n               Ahmed Oumar and\n               Hajar Mousannif and\n               Ismail Berrada},\n  editor    = {Parth Mehta and\n               Thomas Mandl and\n               Prasenjit Majumder and\n               Mandar Mitra},\n  title     = {Deep Multi-Task Models for Misogyny Identification and Categorization\n               on Arabic Social Media},\n  booktitle = {Working Notes of {FIRE} 2021 - Forum for Information Retrieval Evaluation,\n               Gandhinagar, India, December 13-17, 2021},\n  series    = {{CEUR} Workshop Proceedings},\n  volume    = {3159},\n  pages     = {852--860},\n  publisher = {CEUR-WS.org},\n  year      = {2021},\n  url       = {https://ceur-ws.org/Vol-3159/T5-5.pdf},\n  timestamp = {Fri, 10 Mar 2023 16:22:30 +0100},\n  biburl    = {https://dblp.org/rec/conf/fire/MahdaouyMOMB21.bib},\n  bibsource = {dblp computer science bibliography, https://dblp.org}\n}\n"
    },


    {
      "id": "cs-um6p-at-semeval-2021-task-1-a-deep-learning-model-based-pre-trained-transformer-encoder-for-lexical-complexity",
      "title": "CS-UM6P at SemEval-2021 Task 1: A Deep Learning Model-based Pre-trained Transformer Encoder for Lexical Complexity",
      "authors": "Nabil El Mamoun, Abdelkader El Mahdaouy, Abdellah El Mekki, Kabil Essefar, Ismail Berrada",
      "venue": "Proceedings of the 15th International Workshop on Semantic Evaluation (SemEval-2021)",
      "date": "2021-08-05",
      "year": 2021,
      "featured": false,
      "award": "",
      "url": "https://elmekki.me/articles/#cs-um6p-at-semeval-2021-task-1-a-deep-learning-model-based-pre-trained-transformer-encoder-for-lexical-complexity",
      "paper_url": "https://aclanthology.org/2021.semeval-1.73.pdf",
      "abstract": "Lexical Complexity Prediction (LCP) involves assigning a difficulty score to a particular word or expression, in a text intended for a target audience. In this paper, we introduce a new deep learning-based system for this challenging task. The proposed system consists of a deep learning model, based on pre-trained transformer encoder, for word and Multi-Word Expression (MWE) complexity prediction. First, on top of the encoder’s contextualized word embedding, our model employs an attention layer on the input context and the complex word or MWE. Then, the attention output is concatenated with the pooled output of the encoder and passed to a regression module. We investigate both single-task and joint training on both Sub-Tasks data using multiple pre-trained transformer-based encoders. The obtained results are very promising and show the effectiveness of fine-tuning pre-trained transformers for LCP task.",
      "bibtex": "@inproceedings{el-mamoun-etal-2021-cs,\n    title = \"{CS}-{UM}6{P} at {S}em{E}val-2021 Task 1: A Deep Learning Model-based Pre-trained Transformer Encoder for Lexical Complexity\",\n    author = \"El Mamoun, Nabil  and\n      El Mahdaouy, Abdelkader  and\n      El Mekki, Abdellah  and\n      Essefar, Kabil  and\n      Berrada, Ismail\",\n    editor = \"Palmer, Alexis  and\n      Schneider, Nathan  and\n      Schluter, Natalie  and\n      Emerson, Guy  and\n      Herbelot, Aurelie  and\n      Zhu, Xiaodan\",\n    booktitle = \"Proceedings of the 15th International Workshop on Semantic Evaluation (SemEval-2021)\",\n    month = aug,\n    year = \"2021\",\n    address = \"Online\",\n    publisher = \"Association for Computational Linguistics\",\n    url = \"https://aclanthology.org/2021.semeval-1.73/\",\n    doi = \"10.18653/v1/2021.semeval-1.73\",\n    pages = \"585--589\",\n    abstract = \"Lexical Complexity Prediction (LCP) involves assigning a difficulty score to a particular word or expression, in a text intended for a target audience. In this paper, we introduce a new deep learning-based system for this challenging task. The proposed system consists of a deep learning model, based on pre-trained transformer encoder, for word and Multi-Word Expression (MWE) complexity prediction. First, on top of the encoder{'}s contextualized word embedding, our model employs an attention layer on the input context and the complex word or MWE. Then, the attention output is concatenated with the pooled output of the encoder and passed to a regression module. We investigate both single-task and joint training on both Sub-Tasks data using multiple pre-trained transformer-based encoders. The obtained results are very promising and show the effectiveness of fine-tuning pre-trained transformers for LCP task.\"\n}\n"
    },


    {
      "id": "cs-um6p-at-semeval-2021-task-7-deep-multi-task-learning-model-for-detecting-and-rating-humor-and-offense",
      "title": "CS-UM6P at SemEval-2021 Task 7: Deep Multi-Task Learning Model for Detecting and Rating Humor and Offense",
      "authors": "Kabil Essefar, Abdellah El Mekki, Abdelkader El Mahdaouy, Nabil El Mamoun, Ismail Berrada",
      "venue": "Proceedings of the 15th International Workshop on Semantic Evaluation (SemEval-2021)",
      "date": "2021-08-05",
      "year": 2021,
      "featured": false,
      "award": "",
      "url": "https://elmekki.me/articles/#cs-um6p-at-semeval-2021-task-7-deep-multi-task-learning-model-for-detecting-and-rating-humor-and-offense",
      "paper_url": "https://aclanthology.org/2021.semeval-1.159.pdf",
      "abstract": "Humor detection has become a topic of interest for several research teams, especially those involved in socio-psychological studies, with the aim to detect the humor and the temper of a targeted population (e.g. a community, a city, a country, the employees of a given company). Most of the existing studies have formulated the humor detection problem as a binary classification task, whereas it revolves around learning the sense of humor by evaluating its different degrees. In this paper, we propose an end-to-end deep Multi-Task Learning (MTL) model to detect and rate humor and offense. It consists of a pre-trained transformer encoder and task-specific attention layers. The model is trained using MTL uncertainty loss weighting to adaptively combine all sub-tasks objective functions. Our MTL model tackles all sub-tasks of the SemEval-2021 Task-7 in one end-to-end deep learning system and shows very promising results.",
      "bibtex": "@inproceedings{essefar-etal-2021-cs,\ntitle = \"{CS}-{UM}6{P} at {S}em{E}val-2021 Task 7: Deep Multi-Task Learning Model for Detecting and Rating Humor and Offense\",\nauthor = \"Essefar, Kabil  and\nEl Mekki, Abdellah  and\nEl Mahdaouy, Abdelkader  and\nEl Mamoun, Nabil  and\nBerrada, Ismail\",\neditor = \"Palmer, Alexis  and\nSchneider, Nathan  and\nSchluter, Natalie  and\nEmerson, Guy  and\nHerbelot, Aurelie  and\nZhu, Xiaodan\",\nbooktitle = \"Proceedings of the 15th International Workshop on Semantic Evaluation (SemEval-2021)\",\nmonth = aug,\nyear = \"2021\",\naddress = \"Online\",\npublisher = \"Association for Computational Linguistics\",\nurl = \"https://aclanthology.org/2021.semeval-1.159/\",\ndoi = \"10.18653/v1/2021.semeval-1.159\",\npages = \"1135--1140\",\nabstract = \"Humor detection has become a topic of interest for several research teams, especially those involved in socio-psychological studies, with the aim to detect the humor and the temper of a targeted population (e.g. a community, a city, a country, the employees of a given company). Most of the existing studies have formulated the humor detection problem as a binary classification task, whereas it revolves around learning the sense of humor by evaluating its different degrees. In this paper, we propose an end-to-end deep Multi-Task Learning (MTL) model to detect and rate humor and offense. It consists of a pre-trained transformer encoder and task-specific attention layers. The model is trained using MTL uncertainty loss weighting to adaptively combine all sub-tasks objective functions. Our MTL model tackles all sub-tasks of the SemEval-2021 Task-7 in one end-to-end deep learning system and shows very promising results.\"\n}\n"
    },


    {
      "id": "on-the-role-of-orthographic-variations-in-building-multidialectal-arabic-word-embeddings",
      "title": "On the Role of Orthographic Variations in Building Multidialectal Arabic Word Embeddings",
      "authors": "Abdellah El Mekki, Abdelkader El Mahdaouy, Ismail Berrada, Ahmed Khoumsi",
      "venue": "Proceedings of the Canadian Conference on Artificial Intelligence",
      "date": "2021-06-08",
      "year": 2021,
      "featured": false,
      "award": "",
      "url": "https://elmekki.me/articles/#on-the-role-of-orthographic-variations-in-building-multidialectal-arabic-word-embeddings",
      "paper_url": "https://assets.pubpub.org/s5qybplo/11621610534420.pdf",
      "abstract": "Dialectal Arabic (DA) is mostly used by over 400 million people across Arab countries as a communication channel on social media platforms, web forums, and daily life. Building Natural Language Processing systems for each DA variant is a challenging issue due to the lack of data and the noisy nature of the available corpora. In this paper, we propose a method to incorporate orthographic features into word embedding mapping methods, inducing a multidialectal embedding space. Our method can be used for both supervised and unsupervised cross-lingual embedding mapping approaches. The core idea of our method is to project the orthographic features into a shared vector space using Canonical Correlation Analysis (CCA). Then, it extends word embedding vectors using the resulting features and learns the multidialectal mapping. The overall obtained results of our proposed method show that our method enhances Bilingual Lexicon Induction of DA by 3.33% and 17.50% compared to state-of-the-art supervised and unsupervised cross-lingual alignment methods, respectively.",
      "bibtex": "@inproceedings{El2021On,\nauthor = {El Mekki, Abdellah and El Mahdaouy, Abdelkader and Berrada, Ismail and Khoumsi, Ahmed},\nbooktitle = {Canadian {AI} 2021},\nyear = {2021},\nmonth = {jun 8},\nnote = {https://caiac.pubpub.org/pub/pdf9jqoh},\norganization = {Canadian Artificial Intelligence Association (CAIAC)},\ntitle = {On the {Role} of {Orthographic} {Variations} in {Building} {Multidialectal} {Arabic} {Word} {Embeddings}},\n}\n"
    },


    {
      "id": "domain-adaptation-for-arabic-cross-domain-and-cross-dialect-sentiment-analysis-from-contextualized-word-embedding",
      "title": "Domain Adaptation for Arabic Cross-Domain and Cross-Dialect Sentiment Analysis from Contextualized Word Embedding",
      "authors": "Abdellah El Mekki, Abdelkader El Mahdaouy, Ismail Berrada, Ahmed Khoumsi",
      "venue": "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies",
      "date": "2021-06-06",
      "year": 2021,
      "featured": true,
      "award": "",
      "url": "https://elmekki.me/articles/#domain-adaptation-for-arabic-cross-domain-and-cross-dialect-sentiment-analysis-from-contextualized-word-embedding",
      "paper_url": "https://aclanthology.org/2021.naacl-main.226.pdf",
      "abstract": "Finetuning deep pre-trained language models has shown state-of-the-art performances on a wide range of Natural Language Processing (NLP) applications. Nevertheless, their generalization performance drops under domain shift. In the case of Arabic language, diglossia makes building and annotating corpora for each dialect and/or domain a more challenging task. Unsupervised Domain Adaptation tackles this issue by transferring the learned knowledge from labeled source domain data to unlabeled target domain data. In this paper, we propose a new unsupervised domain adaptation method for Arabic cross-domain and cross-dialect sentiment analysis from Contextualized Word Embedding. Several experiments are performed adopting the coarse-grained and the fine-grained taxonomies of Arabic dialects. The obtained results show that our method yields very promising results and outperforms several domain adaptation methods for most of the evaluated datasets. On average, our method increases the performance by an improvement rate of 20.8% over the zero-shot transfer learning from BERT.",
      "bibtex": "@inproceedings{el-mekki-etal-2021-domain,\ntitle = \"Domain Adaptation for {A}rabic Cross-Domain and Cross-Dialect Sentiment Analysis from Contextualized Word Embedding\",\nauthor = \"El Mekki, Abdellah  and\nEl Mahdaouy, Abdelkader  and\nBerrada, Ismail  and\nKhoumsi, Ahmed\",\neditor = \"Toutanova, Kristina  and\nRumshisky, Anna  and\nZettlemoyer, Luke  and\nHakkani-Tur, Dilek  and\nBeltagy, Iz  and\nBethard, Steven  and\nCotterell, Ryan  and\nChakraborty, Tanmoy  and\nZhou, Yichao\",\nbooktitle = \"Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies\",\nmonth = jun,\nyear = \"2021\",\naddress = \"Online\",\npublisher = \"Association for Computational Linguistics\",\nurl = \"https://aclanthology.org/2021.naacl-main.226/\",\ndoi = \"10.18653/v1/2021.naacl-main.226\",\npages = \"2824--2837\",\nabstract = \"Finetuning deep pre-trained language models has shown state-of-the-art performances on a wide range of Natural Language Processing (NLP) applications. Nevertheless, their generalization performance drops under domain shift. In the case of Arabic language, diglossia makes building and annotating corpora for each dialect and/or domain a more challenging task. Unsupervised Domain Adaptation tackles this issue by transferring the learned knowledge from labeled source domain data to unlabeled target domain data. In this paper, we propose a new unsupervised domain adaptation method for Arabic cross-domain and cross-dialect sentiment analysis from Contextualized Word Embedding. Several experiments are performed adopting the coarse-grained and the fine-grained taxonomies of Arabic dialects. The obtained results show that our method yields very promising results and outperforms several domain adaptation methods for most of the evaluated datasets. On average, our method increases the performance by an improvement rate of 20.8{\\%} over the zero-shot transfer learning from BERT.\"\n}\n"
    },


    {
      "id": "deep-multi-task-model-for-sarcasm-detection-and-sentiment-analysis-in-arabic-language",
      "title": "Deep Multi-Task Model for Sarcasm Detection and Sentiment Analysis in Arabic Language",
      "authors": "Abdelkader El Mahdaouy, Abdellah El Mekki, Kabil Essefar, Nabil El Mamoun, Ismail Berrada, Ahmed Khoumsi",
      "venue": "Proceedings of the Sixth Arabic Natural Language Processing Workshop",
      "date": "2021-04-19",
      "year": 2021,
      "featured": false,
      "award": "",
      "url": "https://elmekki.me/articles/#deep-multi-task-model-for-sarcasm-detection-and-sentiment-analysis-in-arabic-language",
      "paper_url": "https://aclanthology.org/2021.wanlp-1.42.pdf",
      "abstract": "The prominence of figurative language devices, such as sarcasm and irony, poses serious challenges for Arabic Sentiment Analysis (SA). While previous research works tackle SA and sarcasm detection separately, this paper introduces an end-to-end deep Multi-Task Learning (MTL) model, allowing knowledge interaction between the two tasks. Our MTL model’s architecture consists of a Bidirectional Encoder Representation from Transformers (BERT) model, a multi-task attention interaction module, and two task classifiers. The overall obtained results show that our proposed model outperforms its single-task and MTL counterparts on both sarcasm and sentiment detection subtasks.",
      "bibtex": "@inproceedings{el-mahdaouy-etal-2021-deep,\ntitle = \"Deep Multi-Task Model for Sarcasm Detection and Sentiment Analysis in {A}rabic Language\",\nauthor = \"El Mahdaouy, Abdelkader  and\nEl Mekki, Abdellah  and\nEssefar, Kabil  and\nEl Mamoun, Nabil  and\nBerrada, Ismail  and\nKhoumsi, Ahmed\",\neditor = \"Habash, Nizar  and\nBouamor, Houda  and\nHajj, Hazem  and\nMagdy, Walid  and\nZaghouani, Wajdi  and\nBougares, Fethi  and\nTomeh, Nadi  and\nAbu Farha, Ibrahim  and\nTouileb, Samia\",\nbooktitle = \"Proceedings of the Sixth Arabic Natural Language Processing Workshop\",\nmonth = apr,\nyear = \"2021\",\naddress = \"Kyiv, Ukraine (Virtual)\",\npublisher = \"Association for Computational Linguistics\",\nurl = \"https://aclanthology.org/2021.wanlp-1.42/\",\npages = \"334--339\",\nabstract = \"The prominence of figurative language devices, such as sarcasm and irony, poses serious challenges for Arabic Sentiment Analysis (SA). While previous research works tackle SA and sarcasm detection separately, this paper introduces an end-to-end deep Multi-Task Learning (MTL) model, allowing knowledge interaction between the two tasks. Our MTL model{'}s architecture consists of a Bidirectional Encoder Representation from Transformers (BERT) model, a multi-task attention interaction module, and two task classifiers. The overall obtained results show that our proposed model outperforms its single-task and MTL counterparts on both sarcasm and sentiment detection subtasks.\"\n}\n"
    },


    {
      "id": "bert-based-multi-task-model-for-country-and-province-level-msa-and-dialectal-arabic-identification",
      "title": "BERT-based multi-task model for country and province level MSA and dialectal Arabic identification",
      "authors": "Abdellah El Mekki, Abdelkader El Mahdaouy, Kabil Essefar, Nabil El Mamoun, Ismail Berrada, Ahmed Khoumsi",
      "venue": "Proceedings of the Sixth Arabic Natural Language Processing Workshop",
      "date": "2021-04-12",
      "year": 2021,
      "featured": false,
      "award": "",
      "url": "https://elmekki.me/articles/#bert-based-multi-task-model-for-country-and-province-level-msa-and-dialectal-arabic-identification",
      "paper_url": "https://aclanthology.org/2021.wanlp-1.31.pdf",
      "abstract": "Dialect and standard language identification are crucial tasks for many Arabic natural language processing applications. In this paper, we present our deep learning-based system, submitted to the second NADI shared task for country-level and province-level identification of Modern Standard Arabic (MSA) and Dialectal Arabic (DA). The system is based on an end-to-end deep Multi-Task Learning (MTL) model to tackle both country-level and province-level MSA/DA identification. The latter MTL model consists of a shared Bidirectional Encoder Representation Transformers (BERT) encoder, two task-specific attention layers, and two classifiers. Our key idea is to leverage both the task-discriminative and the inter-task shared features for country and province MSA/DA identification. The obtained results show that our MTL model outperforms single-task models on most subtasks.",
      "bibtex": "@inproceedings{el-mekki-etal-2021-bert,\ntitle = \"{BERT}-based Multi-Task Model for Country and Province Level {MSA} and Dialectal {A}rabic Identification\",\nauthor = \"El Mekki, Abdellah  and\nEl Mahdaouy, Abdelkader  and\nEssefar, Kabil  and\nEl Mamoun, Nabil  and\nBerrada, Ismail  and\nKhoumsi, Ahmed\",\neditor = \"Habash, Nizar  and\nBouamor, Houda  and\nHajj, Hazem  and\nMagdy, Walid  and\nZaghouani, Wajdi  and\nBougares, Fethi  and\nTomeh, Nadi  and\nAbu Farha, Ibrahim  and\nTouileb, Samia\",\nbooktitle = \"Proceedings of the Sixth Arabic Natural Language Processing Workshop\",\nmonth = apr,\nyear = \"2021\",\naddress = \"Kyiv, Ukraine (Virtual)\",\npublisher = \"Association for Computational Linguistics\",\nurl = \"https://aclanthology.org/2021.wanlp-1.31/\",\npages = \"271--275\",\nabstract = \"Dialect and standard language identification are crucial tasks for many Arabic natural language processing applications. In this paper, we present our deep learning-based system, submitted to the second NADI shared task for country-level and province-level identification of Modern Standard Arabic (MSA) and Dialectal Arabic (DA). The system is based on an end-to-end deep Multi-Task Learning (MTL) model to tackle both country-level and province-level MSA/DA identification. The latter MTL model consists of a shared Bidirectional Encoder Representation Transformers (BERT) encoder, two task-specific attention layers, and two classifiers. Our key idea is to leverage both the task-discriminative and the inter-task shared features for country and province MSA/DA identification. The obtained results show that our MTL model outperforms single-task models on most subtasks.\"\n}\n"
    },


    {
      "id": "weighted-combination-of-bert-and-n-gram-features-for-nuanced-arabic-dialect-identification",
      "title": "Weighted combination of BERT and N-GRAM features for Nuanced Arabic Dialect Identification",
      "authors": "Abdellah El Mekki, Ahmed Alami, Hamza Alami, Ahmed Khoumsi, Ismail Berrada",
      "venue": "Proceedings of the Fifth Arabic Natural Language Processing Workshop",
      "date": "2020-12-12",
      "year": 2020,
      "featured": false,
      "award": "",
      "url": "https://elmekki.me/articles/#weighted-combination-of-bert-and-n-gram-features-for-nuanced-arabic-dialect-identification",
      "paper_url": "https://aclanthology.org/2020.wanlp-1.27.pdf",
      "abstract": "Around the Arab world, different Arabic dialects are spoken by more than 300M persons, and are increasingly popular in social media texts. However, Arabic dialects are considered to be low-resource languages, limiting the development of machine-learning based systems for these dialects. In this paper, we investigate the Arabic dialect identification task, from two perspectives: country-level dialect identification from 21 Arab countries, and province-level dialect identification from 100 provinces. We introduce an unified pipeline of state-of-the-art models, that can handle the two subtasks. Our experimental studies applied to the NADI shared task, show promising results both at the country-level (F1-score of 25.99%) and the province-level (F1-score of 6.39%), and thus allow us to be ranked 2nd for the country-level subtask, and 1st in the province-level subtask.",
      "bibtex": "@inproceedings{el-mekki-etal-2020-weighted,\ntitle = \"Weighted combination of {BERT} and N-{GRAM} features for Nuanced {A}rabic Dialect Identification\",\nauthor = \"El Mekki, Abdellah  and\nAlami, Ahmed  and\nAlami, Hamza  and\nKhoumsi, Ahmed  and\nBerrada, Ismail\",\neditor = \"Zitouni, Imed  and\nAbdul-Mageed, Muhammad  and\nBouamor, Houda  and\nBougares, Fethi  and\nEl-Haj, Mahmoud  and\nTomeh, Nadi  and\nZaghouani, Wajdi\",\nbooktitle = \"Proceedings of the Fifth Arabic Natural Language Processing Workshop\",\nmonth = dec,\nyear = \"2020\",\naddress = \"Barcelona, Spain (Online)\",\npublisher = \"Association for Computational Linguistics\",\nurl = \"https://aclanthology.org/2020.wanlp-1.27/\",\npages = \"268--274\",\nabstract = \"Around the Arab world, different Arabic dialects are spoken by more than 300M persons, and are increasingly popular in social media texts. However, Arabic dialects are considered to be low-resource languages, limiting the development of machine-learning based systems for these dialects. In this paper, we investigate the Arabic dialect identification task, from two perspectives: country-level dialect identification from 21 Arab countries, and province-level dialect identification from 100 provinces. We introduce an unified pipeline of state-of-the-art models, that can handle the two subtasks. Our experimental studies applied to the NADI shared task, show promising results both at the country-level (F1-score of 25.99{\\%}) and the province-level (F1-score of 6.39{\\%}), and thus allow us to be ranked 2nd for the country-level subtask, and 1st in the province-level subtask.\"\n}\n"
    },


    {
      "id": "improving-driver-identification-for-the-next-generation-of-in-vehicle-software-systems",
      "title": "Improving driver identification for the next-generation of in-vehicle software systems",
      "authors": "Abdellah El Mekki, Afaf Bouhoute, Ismail Berrada",
      "venue": "IEEE Transactions on Vehicular Technology",
      "date": "2019-06-23",
      "year": 2019,
      "featured": false,
      "award": "",
      "url": "https://elmekki.me/articles/#improving-driver-identification-for-the-next-generation-of-in-vehicle-software-systems",
      "paper_url": "https://ieeexplore.ieee.org/abstract/document/8746156",
      "abstract": "This paper deals with driver identification and fingerprinting and its application for enhanced driver profiling and car security in connected cars. We introduce a new driver identification model based on collected data from smartphone sensors, and/or the OBD-II protocol, using convolutional neural networks, and recurrent neural networks (long short-term memory) RNN/LSTM. Unlike the existing works, we use a cross-validation technique that provides reproducible results when applied on unseen realistic data. We also studied the robustness of the model to sensor data anomalies. The obtained results show that our model accuracy remains acceptable even when the rate of the anomalies increases substantially. Finally, the proposed model was tested on different datasets and implemented in Automotive Grade Linux Framework, as a real-time anti-theft and the driver profiling system.",
      "bibtex": "@ARTICLE{8746156,\nauthor={Mekki, Abdellah El and Bouhoute, Afaf and Berrada, Ismail},\njournal={IEEE Transactions on Vehicular Technology}, \ntitle={Improving Driver Identification for the Next-Generation of In-Vehicle Software Systems}, \nyear={2019},\nvolume={68},\nnumber={8},\npages={7406-7415},\nkeywords={Automobiles;Sensors;Data models;Global Positioning System;Automotive engineering;Connected vehicles;Driver identification;time series;neural networks;CNN;RNN/LSTM;Automotive Grade Linux;anomaly detection},\ndoi={10.1109/TVT.2019.2924906}}\n"
    }

  ]
}
