{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "991a711f-be98-4aae-a657-84b065449916", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: spacy in /opt/conda/lib/python3.10/site-packages (3.5.0)\n", "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.0.4)\n", "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.3.0)\n", "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.0.12)\n", "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.10.5)\n", "Requirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.1.2)\n", "Requirement already satisfied: typer<0.8.0,>=0.3.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (0.7.0)\n", "Requirement already satisfied: numpy>=1.15.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.23.5)\n", "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /opt/conda/lib/python3.10/site-packages (from spacy) (6.3.0)\n", "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.0.8)\n", "Requirement already satisfied: setuptools in /opt/conda/lib/python3.10/site-packages (from spacy) (67.3.2)\n", "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (23.0)\n", "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.1.1)\n", "Requirement already satisfied: thinc<8.2.0,>=8.1.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (8.1.7)\n", "Requirement already satisfied: pathy>=0.10.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (0.10.1)\n", "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.0.8)\n", "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.4.6)\n", "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (4.64.1)\n", "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.0.7)\n", "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.0.9)\n", "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.28.2)\n", "Requirement already satisfied: typing-extensions>=4.2.0 in /opt/conda/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy) (4.4.0)\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2022.12.7)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.1.1)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.26.14)\n", "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.4)\n", "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /opt/conda/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.0->spacy) (0.7.9)\n", "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /opt/conda/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.0->spacy) (0.0.4)\n", "Requirement already satisfied: click<9.0.0,>=7.1.1 in /opt/conda/lib/python3.10/site-packages (from typer<0.8.0,>=0.3.0->spacy) (8.1.3)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->spacy) (2.1.2)\n", "Collecting en-core-web-trf==3.5.0\n", " Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.5.0/en_core_web_trf-3.5.0-py3-none-any.whl (460.3 MB)\n", "\u001b[2K \u001b[91m━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m36.1/460.3 MB\u001b[0m \u001b[31m31.6 MB/s\u001b[0m eta \u001b[36m0:00:14\u001b[0m" ] } ], "source": [ "try:\n", " _initialized\n", "except:\n", " !pip install spacy\n", " !python -m spacy download en_core_web_trf\n", " import spacy\n", " \n", " spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n", " \n", " _initialized=True\n", " \n", "import pandas as pd\n", "import gzip\n", "import re" ] }, { "cell_type": "code", "execution_count": null, "id": "6b93818f-c54a-4c88-9968-df4244b7c6f6", "metadata": {}, "outputs": [], "source": [ "import spacy\n", "\n", "# Initialize spacy 'en' model, keeping only tagger component needed for lemmatization\n", "nlp = spacy.load('en', disable=['parser', 'ner'])\n", "\n", "sentence = \"The striped bats are hanging on their feet for best\"\n", "\n", "# Parse the sentence using the loaded 'en' model object `nlp`\n", "doc = nlp(sentence)\n", "\n", "# Extract the lemma for each token and join\n", "\" \".join([token.lemma_ for token in doc])\n", "#> 'the strip bat be hang on -PRON- foot for good'" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 5 }