this_algorithm/docs/wordlist-new2.ipynb

113 lines
6.2 KiB
Plaintext
Raw Permalink Normal View History

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "991a711f-be98-4aae-a657-84b065449916",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: spacy in /opt/conda/lib/python3.10/site-packages (3.5.0)\n",
"Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.0.4)\n",
"Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.3.0)\n",
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.0.12)\n",
"Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.10.5)\n",
"Requirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.1.2)\n",
"Requirement already satisfied: typer<0.8.0,>=0.3.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (0.7.0)\n",
"Requirement already satisfied: numpy>=1.15.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.23.5)\n",
"Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /opt/conda/lib/python3.10/site-packages (from spacy) (6.3.0)\n",
"Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.0.8)\n",
"Requirement already satisfied: setuptools in /opt/conda/lib/python3.10/site-packages (from spacy) (67.3.2)\n",
"Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (23.0)\n",
"Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.1.1)\n",
"Requirement already satisfied: thinc<8.2.0,>=8.1.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (8.1.7)\n",
"Requirement already satisfied: pathy>=0.10.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (0.10.1)\n",
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.0.8)\n",
"Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.4.6)\n",
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (4.64.1)\n",
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.0.7)\n",
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.0.9)\n",
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.28.2)\n",
"Requirement already satisfied: typing-extensions>=4.2.0 in /opt/conda/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy) (4.4.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2022.12.7)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.1.1)\n",
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.26.14)\n",
"Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.4)\n",
"Requirement already satisfied: blis<0.8.0,>=0.7.8 in /opt/conda/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.0->spacy) (0.7.9)\n",
"Requirement already satisfied: confection<1.0.0,>=0.0.1 in /opt/conda/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.0->spacy) (0.0.4)\n",
"Requirement already satisfied: click<9.0.0,>=7.1.1 in /opt/conda/lib/python3.10/site-packages (from typer<0.8.0,>=0.3.0->spacy) (8.1.3)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->spacy) (2.1.2)\n",
"Collecting en-core-web-trf==3.5.0\n",
" Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.5.0/en_core_web_trf-3.5.0-py3-none-any.whl (460.3 MB)\n",
"\u001b[2K \u001b[91m━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m36.1/460.3 MB\u001b[0m \u001b[31m31.6 MB/s\u001b[0m eta \u001b[36m0:00:14\u001b[0m"
]
}
],
"source": [
"try:\n",
" _initialized\n",
"except:\n",
" !pip install spacy\n",
" !python -m spacy download en_core_web_trf\n",
" import spacy\n",
" \n",
" spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n",
" \n",
" _initialized=True\n",
" \n",
"import pandas as pd\n",
"import gzip\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b93818f-c54a-4c88-9968-df4244b7c6f6",
"metadata": {},
"outputs": [],
"source": [
"import spacy\n",
"\n",
"# Initialize spacy 'en' model, keeping only tagger component needed for lemmatization\n",
"nlp = spacy.load('en', disable=['parser', 'ner'])\n",
"\n",
"sentence = \"The striped bats are hanging on their feet for best\"\n",
"\n",
"# Parse the sentence using the loaded 'en' model object `nlp`\n",
"doc = nlp(sentence)\n",
"\n",
"# Extract the lemma for each token and join\n",
"\" \".join([token.lemma_ for token in doc])\n",
"#> 'the strip bat be hang on -PRON- foot for good'"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}