Commit before erasing old wordlist code
This commit is contained in:
parent
6ebe8cd489
commit
3ac59f35ed
@ -1 +1,122 @@
|
||||
word,lemmatized_word
|
||||
the,THE
|
||||
of,OF
|
||||
to,TO
|
||||
in,IN
|
||||
is,BE
|
||||
that,THAT
|
||||
for,FOR
|
||||
be,BE
|
||||
by,BY
|
||||
with,WITH
|
||||
on,ON
|
||||
not,NOT
|
||||
this,THIS
|
||||
are,BE
|
||||
at,AT
|
||||
from,FROM
|
||||
he,HE
|
||||
which,WHICH
|
||||
his,HIS
|
||||
have,HAVE
|
||||
an,AN
|
||||
but,BUT
|
||||
you,YOU
|
||||
they,THEY
|
||||
were,BE
|
||||
had,HAVE
|
||||
we,WE
|
||||
all,ALL
|
||||
one,ONE
|
||||
their,THEIR
|
||||
been,BE
|
||||
will,WILL
|
||||
there,THERE
|
||||
can,CAN
|
||||
if,IF
|
||||
other,OTHER
|
||||
would,WOULD
|
||||
no,NO
|
||||
her,SHE
|
||||
may,MAY
|
||||
more,MORE
|
||||
when,WHEN
|
||||
who,WHO
|
||||
such,SUCH
|
||||
these,THESE
|
||||
any,ANY
|
||||
she,SHE
|
||||
new,NEW
|
||||
time,TIME
|
||||
than,THAN
|
||||
do,DO
|
||||
some,SOME
|
||||
what,WHAT
|
||||
only,ONLY
|
||||
into,INTO
|
||||
them,THEY
|
||||
two,TWO
|
||||
also,ALSO
|
||||
about,ABOUT
|
||||
out,OUT
|
||||
him,HE
|
||||
my,MY
|
||||
said,SAY
|
||||
up,UP
|
||||
our,OUR
|
||||
first,FIRST
|
||||
should,SHOULD
|
||||
under,UNDER
|
||||
made,MAKE
|
||||
state,STATE
|
||||
see,SEE
|
||||
after,AFTER
|
||||
could,COULD
|
||||
then,THEN
|
||||
me,I
|
||||
most,MOST
|
||||
over,OVER
|
||||
very,VERY
|
||||
your,YOUR
|
||||
between,BETWEEN
|
||||
where,WHERE
|
||||
now,NOW
|
||||
shall,SHALL
|
||||
work,WORK
|
||||
those,THOSE
|
||||
same,SAME
|
||||
well,WELL
|
||||
each,EACH
|
||||
many,MANY
|
||||
being,BE
|
||||
years,YEAR
|
||||
did,DO
|
||||
year,YEAR
|
||||
through,THROUGH
|
||||
must,MUST
|
||||
upon,UPON
|
||||
before,BEFORE
|
||||
like,LIKE
|
||||
use,USE
|
||||
part,PART
|
||||
general,GENERAL
|
||||
people,PEOPLE
|
||||
because,BECAUSE
|
||||
used,USE
|
||||
how,HOW
|
||||
even,EVEN
|
||||
much,MUCH
|
||||
states,STATE
|
||||
during,DURING
|
||||
both,BOTH
|
||||
case,CASE
|
||||
three,THREE
|
||||
number,NUMBER
|
||||
make,MAKE
|
||||
per,PER
|
||||
great,GREAT
|
||||
act,ACT
|
||||
way,WAY
|
||||
life,LIFE
|
||||
good,GOOD
|
||||
day,DAY
|
||||
|
|
@ -86,11 +86,18 @@ for m in list(sum(custom_maps, ())):
|
||||
print("Step 5")
|
||||
|
||||
# Lemmatize all words (plural -> singular)
|
||||
lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
|
||||
# lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
|
||||
|
||||
with open("01-lemmatized-words.csv", "w") as f:
|
||||
f.write("word,lemmatized_word\n")
|
||||
|
||||
for w in lemmatize_mappings:
|
||||
f.write(f"{w[0]},{w[1]}")
|
||||
f.write("\n")
|
||||
iter = tqdm(all_words[:1000])
|
||||
|
||||
for w in iter:
|
||||
lemmatized_word = nlp(w)[0].lemma_.upper()
|
||||
if lemmatized_word == w:
|
||||
continue
|
||||
if lemmatized_word not in all_words:
|
||||
iter.write(f"{lemmatized_word} not in all_words")
|
||||
|
||||
f.write(f"{w},{lemmatized_word}\n")
|
||||
|
3090
wordlist/02-exclude.txt
Normal file
3090
wordlist/02-exclude.txt
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user