Commit before erasing old wordlist code
This commit is contained in:
parent
6ebe8cd489
commit
3ac59f35ed
@ -1 +1,122 @@
|
|||||||
word,lemmatized_word
|
word,lemmatized_word
|
||||||
|
the,THE
|
||||||
|
of,OF
|
||||||
|
to,TO
|
||||||
|
in,IN
|
||||||
|
is,BE
|
||||||
|
that,THAT
|
||||||
|
for,FOR
|
||||||
|
be,BE
|
||||||
|
by,BY
|
||||||
|
with,WITH
|
||||||
|
on,ON
|
||||||
|
not,NOT
|
||||||
|
this,THIS
|
||||||
|
are,BE
|
||||||
|
at,AT
|
||||||
|
from,FROM
|
||||||
|
he,HE
|
||||||
|
which,WHICH
|
||||||
|
his,HIS
|
||||||
|
have,HAVE
|
||||||
|
an,AN
|
||||||
|
but,BUT
|
||||||
|
you,YOU
|
||||||
|
they,THEY
|
||||||
|
were,BE
|
||||||
|
had,HAVE
|
||||||
|
we,WE
|
||||||
|
all,ALL
|
||||||
|
one,ONE
|
||||||
|
their,THEIR
|
||||||
|
been,BE
|
||||||
|
will,WILL
|
||||||
|
there,THERE
|
||||||
|
can,CAN
|
||||||
|
if,IF
|
||||||
|
other,OTHER
|
||||||
|
would,WOULD
|
||||||
|
no,NO
|
||||||
|
her,SHE
|
||||||
|
may,MAY
|
||||||
|
more,MORE
|
||||||
|
when,WHEN
|
||||||
|
who,WHO
|
||||||
|
such,SUCH
|
||||||
|
these,THESE
|
||||||
|
any,ANY
|
||||||
|
she,SHE
|
||||||
|
new,NEW
|
||||||
|
time,TIME
|
||||||
|
than,THAN
|
||||||
|
do,DO
|
||||||
|
some,SOME
|
||||||
|
what,WHAT
|
||||||
|
only,ONLY
|
||||||
|
into,INTO
|
||||||
|
them,THEY
|
||||||
|
two,TWO
|
||||||
|
also,ALSO
|
||||||
|
about,ABOUT
|
||||||
|
out,OUT
|
||||||
|
him,HE
|
||||||
|
my,MY
|
||||||
|
said,SAY
|
||||||
|
up,UP
|
||||||
|
our,OUR
|
||||||
|
first,FIRST
|
||||||
|
should,SHOULD
|
||||||
|
under,UNDER
|
||||||
|
made,MAKE
|
||||||
|
state,STATE
|
||||||
|
see,SEE
|
||||||
|
after,AFTER
|
||||||
|
could,COULD
|
||||||
|
then,THEN
|
||||||
|
me,I
|
||||||
|
most,MOST
|
||||||
|
over,OVER
|
||||||
|
very,VERY
|
||||||
|
your,YOUR
|
||||||
|
between,BETWEEN
|
||||||
|
where,WHERE
|
||||||
|
now,NOW
|
||||||
|
shall,SHALL
|
||||||
|
work,WORK
|
||||||
|
those,THOSE
|
||||||
|
same,SAME
|
||||||
|
well,WELL
|
||||||
|
each,EACH
|
||||||
|
many,MANY
|
||||||
|
being,BE
|
||||||
|
years,YEAR
|
||||||
|
did,DO
|
||||||
|
year,YEAR
|
||||||
|
through,THROUGH
|
||||||
|
must,MUST
|
||||||
|
upon,UPON
|
||||||
|
before,BEFORE
|
||||||
|
like,LIKE
|
||||||
|
use,USE
|
||||||
|
part,PART
|
||||||
|
general,GENERAL
|
||||||
|
people,PEOPLE
|
||||||
|
because,BECAUSE
|
||||||
|
used,USE
|
||||||
|
how,HOW
|
||||||
|
even,EVEN
|
||||||
|
much,MUCH
|
||||||
|
states,STATE
|
||||||
|
during,DURING
|
||||||
|
both,BOTH
|
||||||
|
case,CASE
|
||||||
|
three,THREE
|
||||||
|
number,NUMBER
|
||||||
|
make,MAKE
|
||||||
|
per,PER
|
||||||
|
great,GREAT
|
||||||
|
act,ACT
|
||||||
|
way,WAY
|
||||||
|
life,LIFE
|
||||||
|
good,GOOD
|
||||||
|
day,DAY
|
||||||
|
|
@ -86,11 +86,18 @@ for m in list(sum(custom_maps, ())):
|
|||||||
print("Step 5")
|
print("Step 5")
|
||||||
|
|
||||||
# Lemmatize all words (plural -> singular)
|
# Lemmatize all words (plural -> singular)
|
||||||
lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
|
# lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
|
||||||
|
|
||||||
with open("01-lemmatized-words.csv", "w") as f:
|
with open("01-lemmatized-words.csv", "w") as f:
|
||||||
f.write("word,lemmatized_word\n")
|
f.write("word,lemmatized_word\n")
|
||||||
|
|
||||||
for w in lemmatize_mappings:
|
iter = tqdm(all_words[:1000])
|
||||||
f.write(f"{w[0]},{w[1]}")
|
|
||||||
f.write("\n")
|
for w in iter:
|
||||||
|
lemmatized_word = nlp(w)[0].lemma_.upper()
|
||||||
|
if lemmatized_word == w:
|
||||||
|
continue
|
||||||
|
if lemmatized_word not in all_words:
|
||||||
|
iter.write(f"{lemmatized_word} not in all_words")
|
||||||
|
|
||||||
|
f.write(f"{w},{lemmatized_word}\n")
|
||||||
|
3090
wordlist/02-exclude.txt
Normal file
3090
wordlist/02-exclude.txt
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user