Use new wordlist generation logic

This commit is contained in:
Austen Adler 2023-03-02 00:20:25 -05:00
parent 3ac59f35ed
commit 71404b8c6e
15 changed files with 11114 additions and 3241 deletions

Binary file not shown.

View File

@ -0,0 +1,501 @@
WORD,ATTEMPTED_LEMMATIZATION,LEMMATIZER
THEMSELVES,THEMSELVE,SPACY
PERHAPS,PERHAP,SPACY
SERIES,SERIE,SPACY
OURSELVES,OURSELVE,SPACY
EXCEED,EXCEE,SPACY
BLEED,BLEE,SPACY
MATHEMATICS,MATHEMATIC,SPACY
NAKED,NAKE,SPACY
SKILLED,SKILLE,SPACY
BELOVED,BELOVE,SPACY
LEST,L,SPACY
WICKED,WICKE,SPACY
EMBED,EMBE,SPACY
DIABETES,DIABETE,SPACY
ONGOING,ONGOE,SPACY
ASHAMED,ASHAME,SPACY
CREED,CREE,SPACY
SINNER,SINN,SPACY
INDEBTED,INDEBTE,SPACY
UNCHANGED,UNCHANGE,SPACY
UNPUBLISHED,UNPUBLISHE,SPACY
UNEMPLOYED,UNEMPLOYE,SPACY
FORTHCOMING,FORTHCOME,SPACY
METAPHYSICS,METAPHYSIC,SPACY
TROUSERS,TROUSER,SPACY
UNAFFECTED,UNAFFECTE,SPACY
RENOWNED,RENOWNE,SPACY
TALENTED,TALENTE,SPACY
GREED,GREE,SPACY
UNFINISHED,UNFINISHE,SPACY
AESTHETICS,AESTHETIC,SPACY
INFRARED,INFRARE,SPACY
DISINTERESTED,DISINTERESTE,SPACY
UNNOTICED,UNNOTICE,SPACY
TING,TE,SPACY
ANNALS,ANNAL,SPACY
OUTSKIRTS,OUTSKIRT,SPACY
DETEST,DET,SPACY
FETTER,FETT,SPACY
SIDEWAYS,SIDEWAY,SPACY
ALMS,ALM,SPACY
MEASLES,MEASLE,SPACY
UNRESTRICTED,UNRESTRICTE,SPACY
ARREARS,ARREAR,SPACY
UNDEVELOPED,UNDEVELOPE,SPACY
CARIES,CARIE,SPACY
MORES,MORE,SPACY
UNALTERED,UNALTERE,SPACY
UNPROTECTED,UNPROTECTE,SPACY
UNEDUCATED,UNEDUCATE,SPACY
GALLOWS,GALLOW,SPACY
UNDATED,UNDATE,SPACY
UNNAMED,UNNAME,SPACY
MONIES,MONIE,SPACY
UNAIDED,UNAIDE,SPACY
UNQUESTIONED,UNQUESTIONE,SPACY
IMBED,IMBE,SPACY
AMINES,AMINE,SPACY
GRASSROOTS,GRASSROOT,SPACY
ACCURSED,ACCURSE,SPACY
UNDECIDED,UNDECIDE,SPACY
UNCHECKED,UNCHECKE,SPACY
UNCOMPLICATED,UNCOMPLICATE,SPACY
BELATED,BELATE,SPACY
UNDETERMINED,UNDETERMINE,SPACY
EAVES,EAVE,SPACY
DISAFFECTED,DISAFFECT,SPACY
UNFULFILLED,UNFULFILLE,SPACY
STRIATED,STRIATE,SPACY
RICKETS,RICKET,SPACY
BICEPS,BICEP,SPACY
DEATHBED,DEATHBE,SPACY
RABIES,RABIE,SPACY
UNATTENDED,UNATTENDE,SPACY
UNABATED,UNABATE,SPACY
MANNERED,MANNERE,SPACY
FAECES,FAECE,SPACY
QUADRUPED,QUADRUPE,SPACY
UNSTRUCTURED,UNSTRUCTURE,SPACY
UNAVAILING,UNAVAILE,SPACY
SUBSPECIES,SUBSPECIE,SPACY
UNDETECTED,UNDETECTE,SPACY
UNPLANNED,UNPLANNE,SPACY
UNCONDITIONED,UNCONDITIONE,SPACY
INBRED,INBREED,SPACY
TONGS,TONG,SPACY
BIGOTED,BIGOTE,SPACY
ENTRAILS,ENTRAIL,SPACY
UNEQUALLED,UNEQUALLE,SPACY
ONCOMING,ONCOME,SPACY
UNTHINKING,UNTHINKE,SPACY
MENSES,MENSE,SPACY
UNMITIGATED,UNMITIGATE,SPACY
UNRECORDED,UNRECORDE,SPACY
SEMIOTICS,SEMIOTIC,SPACY
UNACKNOWLEDGED,UNACKNOWLEDGE,SPACY
UNDISGUISED,UNDISGUISE,SPACY
PYRITES,PYRITE,SPACY
UNOBSTRUCTED,UNOBSTRUCTE,SPACY
UNATTACHED,UNATTACHE,SPACY
UNMATCHED,UNMATCHE,SPACY
PLIERS,PLIER,SPACY
ENAMORED,ENAMOR,SPACY
PANTIES,PANTIE,SPACY
BACKWOODS,BACKWOOD,SPACY
UNPROVOKED,UNPROVOKE,SPACY
TRICEPS,TRICEP,SPACY
UNCHARTED,UNCHARTE,SPACY
MALNOURISHED,MALNOURISH,SPACY
MONEYED,MONEYE,SPACY
UNTAMED,UNTAME,SPACY
METHYLATED,METHYLATE,SPACY
UNRELIEVED,UNRELIEVE,SPACY
UNLETTERED,UNLETTERE,SPACY
HOTBED,HOTBE,SPACY
UNIMPROVED,UNIMPROVE,SPACY
LOPSIDED,LOPSIDE,SPACY
LEGGING,LEGGE,SPACY
UNFEIGNED,UNFEIGNE,SPACY
UNTESTED,UNTESTE,SPACY
UNBLEMISHED,UNBLEMISHE,SPACY
TECHNICS,TECHNIC,SPACY
UNTAINTED,UNTAINTE,SPACY
UNREGISTERED,UNREGISTERE,SPACY
UNFORMED,UNFORME,SPACY
OVERWEENING,OVERWEENE,SPACY
UNPROVIDED,UNPROVIDE,SPACY
FOOTLIGHTS,FOOTLIGHT,SPACY
UNCONVERTED,UNCONVERTE,SPACY
OBSEQUIES,OBSEQUIE,SPACY
PINCERS,PINCER,SPACY
MALADJUSTED,MALADJUSTE,SPACY
ISOSCELES,ISOSCELE,SPACY
UNPROVED,UNPROVE,SPACY
AMIDSHIPS,AMIDSHIP,SPACY
SEMISKILLED,SEMISKILLE,SPACY
UNDIRECTED,UNDIRECTE,SPACY
TABES,TABE,SPACY
UNCLAIMED,UNCLAIME,SPACY
UNPOLISHED,UNPOLISHE,SPACY
FARSIGHTED,FARSIGHTE,SPACY
FAUCES,FAUCE,SPACY
NONCOMMISSIONED,NONCOMMISSIONE,SPACY
UNCHARGED,UNCHARGE,SPACY
CONGERIES,CONGERIE,SPACY
SCABIES,SCABIE,SPACY
MALFORMED,MALFORME,SPACY
INFORMATICS,INFORMATIC,SPACY
INCREMENTED,INCREMENTE,SPACY
UNDISTRIBUTED,UNDISTRIBUTE,SPACY
HYDRODYNAMICS,HYDRODYNAMIC,SPACY
ANTIPODES,ANTIPODE,SPACY
UNDEREMPLOYED,UNDEREMPLOYE,SPACY
BIPED,BIPE,SPACY
ELECTRODYNAMICS,ELECTRODYNAMIC,SPACY
FEEBLEMINDED,FEEBLEMINDE,SPACY
SUDS,SUD,SPACY
UNDERSIZED,UNDERSIZE,SPACY
HUSTINGS,HUSTING,SPACY
STOREYED,STOREYE,SPACY
UNREFINED,UNREFINE,SPACY
UNTURNED,UNTURNE,SPACY
SERRIED,SERRIE,SPACY
DOLDRUMS,DOLDRUM,SPACY
STATS,STAT,SPACY
GULES,GULE,SPACY
UNDERPANTS,UNDERPANT,SPACY
UNREWARDING,UNREWARDE,SPACY
CALIPERS,CALIPER,SPACY
ONESIDED,ONESIDE,SPACY
UNABRIDGED,UNABRIDGE,SPACY
UNFURNISHED,UNFURNISHE,SPACY
UNREDEEMED,UNREDEEME,SPACY
UNSEEING,UNSEEE,SPACY
KNICKERS,KNICKER,SPACY
UNLISTED,UNLISTE,SPACY
INNARDS,INNARD,SPACY
GLANDERS,GLANDER,SPACY
OVEREXTENDED,OVEREXTEND,SPACY
RELATIVIZED,RELATIVIZE,SPACY
UNFUNDED,UNFUNDE,SPACY
COLUMNED,COLUMNE,SPACY
CALISTHENICS,CALISTHENIC,SPACY
SUPERFICIES,SUPERFICIE,SPACY
CASTELLATED,CASTELLATE,SPACY
PUREBRED,PUREBRE,SPACY
CORTES,CORTE,SPACY
REDHEADED,REDHEADE,SPACY
CASTANETS,CASTANET,SPACY
UNPRACTISED,UNPRACTISE,SPACY
UNEDITED,UNEDITE,SPACY
LIVERIED,LIVERIE,SPACY
NOSEBLEED,NOSEBLEE,SPACY
UNDERFUNDED,UNDERFUNDE,SPACY
UNGRADED,UNGRADE,SPACY
UNREDUCED,UNREDUCE,SPACY
SIDEBURNS,SIDEBURN,SPACY
SICKBED,SICKBE,SPACY
FASCES,FASCE,SPACY
AVIONICS,AVIONIC,SPACY
CRENELATED,CRENELATE,SPACY
NATES,NATE,SPACY
UNREGARDED,UNREGARDE,SPACY
UNRECONSTRUCTED,UNRECONSTRUCTE,SPACY
BRITCHES,BRITCHE,SPACY
DEDANS,DEDAN,SPACY
PARATROOPS,PARATROOP,SPACY
FINEGRAINED,FINEGRAINE,SPACY
GREAVES,GREAVE,SPACY
SUBSCRIPTED,SUBSCRIPTE,SPACY
LUES,LUE,SPACY
BIOMETRICS,BIOMETRIC,SPACY
ARIES,ARIE,SPACY
GASWORKS,GASWORK,SPACY
BULLETED,BULLETE,SPACY
HEARTSTRINGS,HEARTSTRING,SPACY
INCREMENTING,INCREMENTE,SPACY
UNCLEARED,UNCLEARE,SPACY
CONSOLS,CONSOL,SPACY
MUDFLATS,MUDFLAT,SPACY
BADLANDS,BADLAND,SPACY
TALIPES,TALIPE,SPACY
LANCINATING,LANCINATE,SPACY
UNACCOMPLISHED,UNACCOMPLISHE,SPACY
TESSELLATED,TESSELLATE,SPACY
SEMIFINISHED,SEMIFINISHE,SPACY
UNAVENGED,UNAVENGE,SPACY
SOAPSUDS,SOAPSUD,SPACY
YEATS,YEAT,SPACY
TELEMATICS,TELEMATIC,SPACY
UNNOTED,UNNOTE,SPACY
MEALIES,MEALIE,SPACY
PARIES,PARIE,SPACY
AUROCHS,AUROCH,SPACY
AGOING,AGOE,SPACY
ODDMENTS,ODDMENT,SPACY
CLEARHEADED,CLEARHEADE,SPACY
REDOUBTED,REDOUBTE,SPACY
IVIED,IVIE,SPACY
PINNIPED,PINNIPE,SPACY
DEFATTED,DEFATTE,SPACY
DECAFFEINATED,DECAFFEINATE,SPACY
NINEPINS,NINEPIN,SPACY
CAMPHORATED,CAMPHORATE,SPACY
GLASSWORKS,GLASSWORK,SPACY
SORITES,SORITE,SPACY
AFFOREST,AFFOR,SPACY
DISSAVING,DISSAVE,SPACY
UNADVISED,UNADVISE,SPACY
UNRECLAIMED,UNRECLAIME,SPACY
LARES,LARE,SPACY
LEVELHEADED,LEVELHEADE,SPACY
SWEATPANTS,SWEATPANT,SPACY
LOTOS,LOTO,SPACY
GIBLETS,GIBLET,SPACY
UNBOWED,UNBOWE,SPACY
UNPROMPTED,UNPROMPTE,SPACY
ABSCESSED,ABSCESSE,SPACY
NODULATED,NODULATE,SPACY
RUBENS,RUBEN,SPACY
UNPAGED,UNPAGE,SPACY
CALENDS,CALEND,SPACY
TRUNKED,TRUNKE,SPACY
TROUSERED,TROUSERE,SPACY
PENATES,PENATE,SPACY
COMBINATORICS,COMBINATORIC,SPACY
TRESSED,TRESSE,SPACY
PARTICOLOURED,PARTICOLOURE,SPACY
UNENCRYPTED,UNENCRYPTE,SPACY
ASTRONAUTICS,ASTRONAUTIC,SPACY
HYDROPONICS,HYDROPONIC,SPACY
UNFORMATTED,UNFORMATTE,SPACY
SEMIDETACHED,SEMIDETACHE,SPACY
BONKERS,BONKER,SPACY
UNDIES,UNDIE,SPACY
EPS,EP,SPACY
GIMBALS,GIMBAL,SPACY
BALCONIED,BALCONIE,SPACY
SALTWORKS,SALTWORK,SPACY
UNPLEDGED,UNPLEDGE,SPACY
PREDESIGNED,PREDESIGNE,SPACY
NFS,NF,SPACY
UNDERBRED,UNDERBRE,SPACY
PRECOMPILED,PRECOMPILE,SPACY
KALENDS,KALEND,SPACY
LITOTES,LITOTE,SPACY
INDIGESTED,INDIGESTE,SPACY
CITS,CIT,SPACY
UNPRICED,UNPRICE,SPACY
PINCHERS,PINCHER,SPACY
CANCELLATED,CANCELLATE,SPACY
CHITTERLINGS,CHITTERLING,SPACY
DIBS,DIB,SPACY
RIGHTWARDS,RIGHTWARD,SPACY
CONVENANCES,CONVENANCE,SPACY
INTERALLIED,INTERALLIE,SPACY
FLINDERS,FLINDER,SPACY
CRANNIED,CRANNIE,SPACY
HOMEBRED,HOMEBRE,SPACY
HIGHBRED,HIGHBRE,SPACY
UNRULED,UNRULE,SPACY
FOREHANDED,FOREHANDE,SPACY
PREPACKED,PREPACKE,SPACY
UNWISHED,UNWISHE,SPACY
ENTREMETS,ENTREMET,SPACY
ESTOVERS,ESTOVER,SPACY
ANGELES,ANGELE,SPACY
DAISIED,DAISIE,SPACY
UPRATED,UPRATE,SPACY
THIGHED,THIGHE,SPACY
TURPS,TURP,SPACY
WEAZENED,WEAZENE,SPACY
EFFING,EFF,SPACY
HOLS,HOL,SPACY
JIGGERED,JIGGERE,SPACY
SOCRATES,SOCRATE,SPACY
AUDITORIES,AUDITORIE,SPACY
AMBAGES,AMBAGE,SPACY
DOITED,DOITE,SPACY
BIONICS,BIONIC,SPACY
UNREFERENCED,UNREFERENCE,SPACY
EXEQUIES,EXEQUIE,SPACY
CERASTES,CERASTE,SPACY
SEMIMANUFACTURES,SEMIMANUFACTURE,SPACY
GALLUSES,GALLUS,SPACY
RERECORDED,RERECORDE,SPACY
TELESALES,TELESALE,SPACY
MICROGRAPHICS,MICROGRAPHIC,SPACY
SIEMENS,SIEMEN,SPACY
ZOUNDS,ZOUND,SPACY
SEMIFIXED,SEMIFIXE,SPACY
UNDIVERTED,UNDIVERTE,SPACY
SANIES,SANIE,SPACY
BREECHING,BREECHE,SPACY
MENTHOLATED,MENTHOLATE,SPACY
PANTALETS,PANTALET,SPACY
CRUDITES,CRUDITE,SPACY
TRAPES,TRAPE,SPACY
PIXILATED,PIXILATE,SPACY
BOOTES,BOOTE,SPACY
UNPOSTED,UNPOSTE,SPACY
HANTS,HANT,SPACY
UNDETAILED,UNDETAILE,SPACY
HAVINGS,HAVING,SPACY
OUTGIVING,OUTGIVE,SPACY
UNCOMPLEMENTED,UNCOMPLEMENTE,SPACY
PRATIES,PRATIE,SPACY
ELEVENSES,ELEVENSE,SPACY
UNENLIVENED,UNENLIVENE,SPACY
NANTES,NANTE,SPACY
AFFINED,AFFINE,SPACY
NONNESTED,NONNESTE,SPACY
FALLOWING,FALLOWE,SPACY
HYDROMECHANICS,HYDROMECHANIC,SPACY
CLIVERS,CLIVER,SPACY
UNICES,UNICE,SPACY
GRAMMATICS,GRAMMATIC,SPACY
PRAPS,PRAP,SPACY
INTERWORKING,INTERWORKE,SPACY
HERCULES,HERCULE,SPACY
BIGHEADED,BIGHEADE,SPACY
KIES,KY,SPACY
NETHERLANDS,NETHERLAND,SPACY
UNBOOKED,UNBOOKE,SPACY
QUINS,QUIN,SPACY
CANNES,CANNE,SPACY
UNNURTURED,UNNURTURE,SPACY
WEDGIES,WEDGIE,SPACY
HANDWORKED,HANDWORKE,SPACY
ANALECTS,ANALECT,SPACY
HERTS,HERT,SPACY
ORLEANS,ORLEAN,SPACY
PESCADORES,PESCADORE,SPACY
ULCERED,ULCERE,SPACY
MISCREATED,MISCREATE,SPACY
UNPRIZED,UNPRIZE,SPACY
SLYBOOTS,SLYBOOT,SPACY
RUNTED,RUNTE,SPACY
REATTRIBUTED,REATTRIBUTE,SPACY
HOUSETRAINED,HOUSETRAINE,SPACY
SOBERSIDES,SOBERSIDE,SPACY
COLESEED,COLESEE,SPACY
BLUCHERS,BLUCHER,SPACY
MUGGINS,MUGGIN,SPACY
UNCRIPPLED,UNCRIPPLE,SPACY
HEPPED,HEPPE,SPACY
WITHINDOORS,WITHINDOOR,SPACY
BEESTINGS,BEESTING,SPACY
FLANDERS,FLANDER,SPACY
DIOGENES,DIOGENE,SPACY
COSMONAUTICS,COSMONAUTIC,SPACY
WHOLEGRAINS,WHOLEGRAIN,SPACY
NEEDMENTS,NEEDMENT,SPACY
ACHATES,ACHATE,SPACY
PRECOMPILING,PRECOMPILE,SPACY
BALUSTERED,BALUSTERE,SPACY
JUGGINS,JUGGIN,SPACY
UNCONFIGURED,UNCONFIGURE,SPACY
SLUGABED,SLUGABE,SPACY
CHARGRILLED,CHARGRILLE,SPACY
GANGES,GANGE,SPACY
FLATWAYS,FLATWAY,SPACY
CHAMPERS,CHAMPER,SPACY
GOLDILOCKS,GOLDILOCK,SPACY
REIMS,REIM,SPACY
REIMPORTING,REIMPORTE,SPACY
EMOTIONED,EMOTIONE,SPACY
AIRBED,AIRBE,SPACY
GIGAFLOPS,GIGAFLOP,SPACY
YONKS,YONK,SPACY
CASALS,CASAL,SPACY
ROCKIES,ROCKIE,SPACY
ORESTES,ORESTE,SPACY
REMAPPING,REMAPPE,SPACY
EBONICS,EBONIC,SPACY
BRUGES,BRUGE,SPACY
JANKERS,JANKER,SPACY
NOTTS,NOTT,SPACY
PROCRUSTES,PROCRUSTE,SPACY
MULTISCALED,MULTISCALE,SPACY
AGROTECHNICS,AGROTECHNIC,SPACY
WAYGOING,WAYGOE,SPACY
GENDERING,GENDERE,SPACY
TELEMECHANICS,TELEMECHANIC,SPACY
DEGATING,DEGATE,SPACY
THAMES,THAME,SPACY
LOWLIVED,LOWLIVE,SPACY
REEDING,REEDE,SPACY
INTERCROSSING,INTERCROSSE,SPACY
UNDEDUCTED,UNDEDUCTE,SPACY
AGOGICS,AGOGIC,SPACY
UNATTENDING,UNATTENDE,SPACY
OVERMASTED,OVERMASTE,SPACY
GILES,GILE,SPACY
NONCOPYRIGHTED,NONCOPYRIGHTE,SPACY
LUDDITES,LUDDITE,SPACY
SCURVIED,SCURVIE,SPACY
REBREAKING,REBREAKE,SPACY
KEATS,KEAT,SPACY
CERVANTES,CERVANTE,SPACY
UNCONDONED,UNCONDONE,SPACY
DESCARTES,DESCARTE,SPACY
BEJABERS,BEJABER,SPACY
VIDEOGRAPHICS,VIDEOGRAPHIC,SPACY
EURIPIDES,EURIPIDE,SPACY
UNPERJURED,UNPERJURE,SPACY
LAERTES,LAERTE,SPACY
OVERCOLLECTED,OVERCOLLECTE,SPACY
AMPHIBRACHYS,AMPHIBRACHY,SPACY
CHEOPS,CHEOP,SPACY
CHALONS,CHALON,SPACY
VERSICOLOURED,VERSICOLOURE,SPACY
SUBPARTITIONED,SUBPARTITIONE,SPACY
BALBUTIES,BALBUTIE,SPACY
ARCHIMEDES,ARCHIMEDE,SPACY
GATELEGGED,GATELEGGE,SPACY
POITIERS,POITIER,SPACY
HAVERING,HAVERE,SPACY
THEBES,THEBE,SPACY
SEVRES,SEVRE,SPACY
PERICLES,PERICLE,SPACY
LIMOGES,LIMOGE,SPACY
EVENTING,EVENTE,SPACY
FATBITS,FATBIT,SPACY
HUTTING,HUTTE,SPACY
DOGSHORES,DOGSHORE,SPACY
OVERBADING,OVERBADE,SPACY
AZORES,AZORE,SPACY
BLEWITS,BLEWIT,SPACY
HIPOCRATES,HIPOCRATE,SPACY
AMIENS,AMIEN,SPACY
GUTTING,GUTTE,SPACY
GLADYS,GLADY,SPACY
CHADDED,CHADDE,SPACY
EUPHRATES,EUPHRATE,SPACY
TROWING,TROWE,SPACY
LACEUPS,LACEUP,SPACY
ALIPED,ALIPE,SPACY
TALIPED,TALIPE,SPACY
RAMSES,RAMSE,SPACY
CENTRONICS,CENTRONIC,SPACY
BANTING,BANTE,SPACY
TELEPHOTOLENS,TELEPHOTOLEN,SPACY
ARAKS,ARAK,SPACY
DONETS,DONET,SPACY
CEROPLASTICS,CEROPLASTIC,SPACY
BAYNETWORKS,BAYNETWORK,SPACY
NORWARDS,NORWARD,SPACY
HAPPING,HAPPE,SPACY
BARENTS,BARENT,SPACY
ABLINGS,ABLING,SPACY
CELLING,CELLE,SPACY
CELEBES,CELEBE,SPACY
NENETS,NENET,SPACY
IMPING,IMPE,SPACY
LINARES,LINARE,SPACY
VAILING,VAILE,SPACY
HABDABS,HABDAB,SPACY
RELISTING,RELISTE,SPACY
HOUGHING,HOUGHE,SPACY
1 WORD ATTEMPTED_LEMMATIZATION LEMMATIZER
2 THEMSELVES THEMSELVE SPACY
3 PERHAPS PERHAP SPACY
4 SERIES SERIE SPACY
5 OURSELVES OURSELVE SPACY
6 EXCEED EXCEE SPACY
7 BLEED BLEE SPACY
8 MATHEMATICS MATHEMATIC SPACY
9 NAKED NAKE SPACY
10 SKILLED SKILLE SPACY
11 BELOVED BELOVE SPACY
12 LEST L SPACY
13 WICKED WICKE SPACY
14 EMBED EMBE SPACY
15 DIABETES DIABETE SPACY
16 ONGOING ONGOE SPACY
17 ASHAMED ASHAME SPACY
18 CREED CREE SPACY
19 SINNER SINN SPACY
20 INDEBTED INDEBTE SPACY
21 UNCHANGED UNCHANGE SPACY
22 UNPUBLISHED UNPUBLISHE SPACY
23 UNEMPLOYED UNEMPLOYE SPACY
24 FORTHCOMING FORTHCOME SPACY
25 METAPHYSICS METAPHYSIC SPACY
26 TROUSERS TROUSER SPACY
27 UNAFFECTED UNAFFECTE SPACY
28 RENOWNED RENOWNE SPACY
29 TALENTED TALENTE SPACY
30 GREED GREE SPACY
31 UNFINISHED UNFINISHE SPACY
32 AESTHETICS AESTHETIC SPACY
33 INFRARED INFRARE SPACY
34 DISINTERESTED DISINTERESTE SPACY
35 UNNOTICED UNNOTICE SPACY
36 TING TE SPACY
37 ANNALS ANNAL SPACY
38 OUTSKIRTS OUTSKIRT SPACY
39 DETEST DET SPACY
40 FETTER FETT SPACY
41 SIDEWAYS SIDEWAY SPACY
42 ALMS ALM SPACY
43 MEASLES MEASLE SPACY
44 UNRESTRICTED UNRESTRICTE SPACY
45 ARREARS ARREAR SPACY
46 UNDEVELOPED UNDEVELOPE SPACY
47 CARIES CARIE SPACY
48 MORES MORE SPACY
49 UNALTERED UNALTERE SPACY
50 UNPROTECTED UNPROTECTE SPACY
51 UNEDUCATED UNEDUCATE SPACY
52 GALLOWS GALLOW SPACY
53 UNDATED UNDATE SPACY
54 UNNAMED UNNAME SPACY
55 MONIES MONIE SPACY
56 UNAIDED UNAIDE SPACY
57 UNQUESTIONED UNQUESTIONE SPACY
58 IMBED IMBE SPACY
59 AMINES AMINE SPACY
60 GRASSROOTS GRASSROOT SPACY
61 ACCURSED ACCURSE SPACY
62 UNDECIDED UNDECIDE SPACY
63 UNCHECKED UNCHECKE SPACY
64 UNCOMPLICATED UNCOMPLICATE SPACY
65 BELATED BELATE SPACY
66 UNDETERMINED UNDETERMINE SPACY
67 EAVES EAVE SPACY
68 DISAFFECTED DISAFFECT SPACY
69 UNFULFILLED UNFULFILLE SPACY
70 STRIATED STRIATE SPACY
71 RICKETS RICKET SPACY
72 BICEPS BICEP SPACY
73 DEATHBED DEATHBE SPACY
74 RABIES RABIE SPACY
75 UNATTENDED UNATTENDE SPACY
76 UNABATED UNABATE SPACY
77 MANNERED MANNERE SPACY
78 FAECES FAECE SPACY
79 QUADRUPED QUADRUPE SPACY
80 UNSTRUCTURED UNSTRUCTURE SPACY
81 UNAVAILING UNAVAILE SPACY
82 SUBSPECIES SUBSPECIE SPACY
83 UNDETECTED UNDETECTE SPACY
84 UNPLANNED UNPLANNE SPACY
85 UNCONDITIONED UNCONDITIONE SPACY
86 INBRED INBREED SPACY
87 TONGS TONG SPACY
88 BIGOTED BIGOTE SPACY
89 ENTRAILS ENTRAIL SPACY
90 UNEQUALLED UNEQUALLE SPACY
91 ONCOMING ONCOME SPACY
92 UNTHINKING UNTHINKE SPACY
93 MENSES MENSE SPACY
94 UNMITIGATED UNMITIGATE SPACY
95 UNRECORDED UNRECORDE SPACY
96 SEMIOTICS SEMIOTIC SPACY
97 UNACKNOWLEDGED UNACKNOWLEDGE SPACY
98 UNDISGUISED UNDISGUISE SPACY
99 PYRITES PYRITE SPACY
100 UNOBSTRUCTED UNOBSTRUCTE SPACY
101 UNATTACHED UNATTACHE SPACY
102 UNMATCHED UNMATCHE SPACY
103 PLIERS PLIER SPACY
104 ENAMORED ENAMOR SPACY
105 PANTIES PANTIE SPACY
106 BACKWOODS BACKWOOD SPACY
107 UNPROVOKED UNPROVOKE SPACY
108 TRICEPS TRICEP SPACY
109 UNCHARTED UNCHARTE SPACY
110 MALNOURISHED MALNOURISH SPACY
111 MONEYED MONEYE SPACY
112 UNTAMED UNTAME SPACY
113 METHYLATED METHYLATE SPACY
114 UNRELIEVED UNRELIEVE SPACY
115 UNLETTERED UNLETTERE SPACY
116 HOTBED HOTBE SPACY
117 UNIMPROVED UNIMPROVE SPACY
118 LOPSIDED LOPSIDE SPACY
119 LEGGING LEGGE SPACY
120 UNFEIGNED UNFEIGNE SPACY
121 UNTESTED UNTESTE SPACY
122 UNBLEMISHED UNBLEMISHE SPACY
123 TECHNICS TECHNIC SPACY
124 UNTAINTED UNTAINTE SPACY
125 UNREGISTERED UNREGISTERE SPACY
126 UNFORMED UNFORME SPACY
127 OVERWEENING OVERWEENE SPACY
128 UNPROVIDED UNPROVIDE SPACY
129 FOOTLIGHTS FOOTLIGHT SPACY
130 UNCONVERTED UNCONVERTE SPACY
131 OBSEQUIES OBSEQUIE SPACY
132 PINCERS PINCER SPACY
133 MALADJUSTED MALADJUSTE SPACY
134 ISOSCELES ISOSCELE SPACY
135 UNPROVED UNPROVE SPACY
136 AMIDSHIPS AMIDSHIP SPACY
137 SEMISKILLED SEMISKILLE SPACY
138 UNDIRECTED UNDIRECTE SPACY
139 TABES TABE SPACY
140 UNCLAIMED UNCLAIME SPACY
141 UNPOLISHED UNPOLISHE SPACY
142 FARSIGHTED FARSIGHTE SPACY
143 FAUCES FAUCE SPACY
144 NONCOMMISSIONED NONCOMMISSIONE SPACY
145 UNCHARGED UNCHARGE SPACY
146 CONGERIES CONGERIE SPACY
147 SCABIES SCABIE SPACY
148 MALFORMED MALFORME SPACY
149 INFORMATICS INFORMATIC SPACY
150 INCREMENTED INCREMENTE SPACY
151 UNDISTRIBUTED UNDISTRIBUTE SPACY
152 HYDRODYNAMICS HYDRODYNAMIC SPACY
153 ANTIPODES ANTIPODE SPACY
154 UNDEREMPLOYED UNDEREMPLOYE SPACY
155 BIPED BIPE SPACY
156 ELECTRODYNAMICS ELECTRODYNAMIC SPACY
157 FEEBLEMINDED FEEBLEMINDE SPACY
158 SUDS SUD SPACY
159 UNDERSIZED UNDERSIZE SPACY
160 HUSTINGS HUSTING SPACY
161 STOREYED STOREYE SPACY
162 UNREFINED UNREFINE SPACY
163 UNTURNED UNTURNE SPACY
164 SERRIED SERRIE SPACY
165 DOLDRUMS DOLDRUM SPACY
166 STATS STAT SPACY
167 GULES GULE SPACY
168 UNDERPANTS UNDERPANT SPACY
169 UNREWARDING UNREWARDE SPACY
170 CALIPERS CALIPER SPACY
171 ONESIDED ONESIDE SPACY
172 UNABRIDGED UNABRIDGE SPACY
173 UNFURNISHED UNFURNISHE SPACY
174 UNREDEEMED UNREDEEME SPACY
175 UNSEEING UNSEEE SPACY
176 KNICKERS KNICKER SPACY
177 UNLISTED UNLISTE SPACY
178 INNARDS INNARD SPACY
179 GLANDERS GLANDER SPACY
180 OVEREXTENDED OVEREXTEND SPACY
181 RELATIVIZED RELATIVIZE SPACY
182 UNFUNDED UNFUNDE SPACY
183 COLUMNED COLUMNE SPACY
184 CALISTHENICS CALISTHENIC SPACY
185 SUPERFICIES SUPERFICIE SPACY
186 CASTELLATED CASTELLATE SPACY
187 PUREBRED PUREBRE SPACY
188 CORTES CORTE SPACY
189 REDHEADED REDHEADE SPACY
190 CASTANETS CASTANET SPACY
191 UNPRACTISED UNPRACTISE SPACY
192 UNEDITED UNEDITE SPACY
193 LIVERIED LIVERIE SPACY
194 NOSEBLEED NOSEBLEE SPACY
195 UNDERFUNDED UNDERFUNDE SPACY
196 UNGRADED UNGRADE SPACY
197 UNREDUCED UNREDUCE SPACY
198 SIDEBURNS SIDEBURN SPACY
199 SICKBED SICKBE SPACY
200 FASCES FASCE SPACY
201 AVIONICS AVIONIC SPACY
202 CRENELATED CRENELATE SPACY
203 NATES NATE SPACY
204 UNREGARDED UNREGARDE SPACY
205 UNRECONSTRUCTED UNRECONSTRUCTE SPACY
206 BRITCHES BRITCHE SPACY
207 DEDANS DEDAN SPACY
208 PARATROOPS PARATROOP SPACY
209 FINEGRAINED FINEGRAINE SPACY
210 GREAVES GREAVE SPACY
211 SUBSCRIPTED SUBSCRIPTE SPACY
212 LUES LUE SPACY
213 BIOMETRICS BIOMETRIC SPACY
214 ARIES ARIE SPACY
215 GASWORKS GASWORK SPACY
216 BULLETED BULLETE SPACY
217 HEARTSTRINGS HEARTSTRING SPACY
218 INCREMENTING INCREMENTE SPACY
219 UNCLEARED UNCLEARE SPACY
220 CONSOLS CONSOL SPACY
221 MUDFLATS MUDFLAT SPACY
222 BADLANDS BADLAND SPACY
223 TALIPES TALIPE SPACY
224 LANCINATING LANCINATE SPACY
225 UNACCOMPLISHED UNACCOMPLISHE SPACY
226 TESSELLATED TESSELLATE SPACY
227 SEMIFINISHED SEMIFINISHE SPACY
228 UNAVENGED UNAVENGE SPACY
229 SOAPSUDS SOAPSUD SPACY
230 YEATS YEAT SPACY
231 TELEMATICS TELEMATIC SPACY
232 UNNOTED UNNOTE SPACY
233 MEALIES MEALIE SPACY
234 PARIES PARIE SPACY
235 AUROCHS AUROCH SPACY
236 AGOING AGOE SPACY
237 ODDMENTS ODDMENT SPACY
238 CLEARHEADED CLEARHEADE SPACY
239 REDOUBTED REDOUBTE SPACY
240 IVIED IVIE SPACY
241 PINNIPED PINNIPE SPACY
242 DEFATTED DEFATTE SPACY
243 DECAFFEINATED DECAFFEINATE SPACY
244 NINEPINS NINEPIN SPACY
245 CAMPHORATED CAMPHORATE SPACY
246 GLASSWORKS GLASSWORK SPACY
247 SORITES SORITE SPACY
248 AFFOREST AFFOR SPACY
249 DISSAVING DISSAVE SPACY
250 UNADVISED UNADVISE SPACY
251 UNRECLAIMED UNRECLAIME SPACY
252 LARES LARE SPACY
253 LEVELHEADED LEVELHEADE SPACY
254 SWEATPANTS SWEATPANT SPACY
255 LOTOS LOTO SPACY
256 GIBLETS GIBLET SPACY
257 UNBOWED UNBOWE SPACY
258 UNPROMPTED UNPROMPTE SPACY
259 ABSCESSED ABSCESSE SPACY
260 NODULATED NODULATE SPACY
261 RUBENS RUBEN SPACY
262 UNPAGED UNPAGE SPACY
263 CALENDS CALEND SPACY
264 TRUNKED TRUNKE SPACY
265 TROUSERED TROUSERE SPACY
266 PENATES PENATE SPACY
267 COMBINATORICS COMBINATORIC SPACY
268 TRESSED TRESSE SPACY
269 PARTICOLOURED PARTICOLOURE SPACY
270 UNENCRYPTED UNENCRYPTE SPACY
271 ASTRONAUTICS ASTRONAUTIC SPACY
272 HYDROPONICS HYDROPONIC SPACY
273 UNFORMATTED UNFORMATTE SPACY
274 SEMIDETACHED SEMIDETACHE SPACY
275 BONKERS BONKER SPACY
276 UNDIES UNDIE SPACY
277 EPS EP SPACY
278 GIMBALS GIMBAL SPACY
279 BALCONIED BALCONIE SPACY
280 SALTWORKS SALTWORK SPACY
281 UNPLEDGED UNPLEDGE SPACY
282 PREDESIGNED PREDESIGNE SPACY
283 NFS NF SPACY
284 UNDERBRED UNDERBRE SPACY
285 PRECOMPILED PRECOMPILE SPACY
286 KALENDS KALEND SPACY
287 LITOTES LITOTE SPACY
288 INDIGESTED INDIGESTE SPACY
289 CITS CIT SPACY
290 UNPRICED UNPRICE SPACY
291 PINCHERS PINCHER SPACY
292 CANCELLATED CANCELLATE SPACY
293 CHITTERLINGS CHITTERLING SPACY
294 DIBS DIB SPACY
295 RIGHTWARDS RIGHTWARD SPACY
296 CONVENANCES CONVENANCE SPACY
297 INTERALLIED INTERALLIE SPACY
298 FLINDERS FLINDER SPACY
299 CRANNIED CRANNIE SPACY
300 HOMEBRED HOMEBRE SPACY
301 HIGHBRED HIGHBRE SPACY
302 UNRULED UNRULE SPACY
303 FOREHANDED FOREHANDE SPACY
304 PREPACKED PREPACKE SPACY
305 UNWISHED UNWISHE SPACY
306 ENTREMETS ENTREMET SPACY
307 ESTOVERS ESTOVER SPACY
308 ANGELES ANGELE SPACY
309 DAISIED DAISIE SPACY
310 UPRATED UPRATE SPACY
311 THIGHED THIGHE SPACY
312 TURPS TURP SPACY
313 WEAZENED WEAZENE SPACY
314 EFFING EFF SPACY
315 HOLS HOL SPACY
316 JIGGERED JIGGERE SPACY
317 SOCRATES SOCRATE SPACY
318 AUDITORIES AUDITORIE SPACY
319 AMBAGES AMBAGE SPACY
320 DOITED DOITE SPACY
321 BIONICS BIONIC SPACY
322 UNREFERENCED UNREFERENCE SPACY
323 EXEQUIES EXEQUIE SPACY
324 CERASTES CERASTE SPACY
325 SEMIMANUFACTURES SEMIMANUFACTURE SPACY
326 GALLUSES GALLUS SPACY
327 RERECORDED RERECORDE SPACY
328 TELESALES TELESALE SPACY
329 MICROGRAPHICS MICROGRAPHIC SPACY
330 SIEMENS SIEMEN SPACY
331 ZOUNDS ZOUND SPACY
332 SEMIFIXED SEMIFIXE SPACY
333 UNDIVERTED UNDIVERTE SPACY
334 SANIES SANIE SPACY
335 BREECHING BREECHE SPACY
336 MENTHOLATED MENTHOLATE SPACY
337 PANTALETS PANTALET SPACY
338 CRUDITES CRUDITE SPACY
339 TRAPES TRAPE SPACY
340 PIXILATED PIXILATE SPACY
341 BOOTES BOOTE SPACY
342 UNPOSTED UNPOSTE SPACY
343 HANTS HANT SPACY
344 UNDETAILED UNDETAILE SPACY
345 HAVINGS HAVING SPACY
346 OUTGIVING OUTGIVE SPACY
347 UNCOMPLEMENTED UNCOMPLEMENTE SPACY
348 PRATIES PRATIE SPACY
349 ELEVENSES ELEVENSE SPACY
350 UNENLIVENED UNENLIVENE SPACY
351 NANTES NANTE SPACY
352 AFFINED AFFINE SPACY
353 NONNESTED NONNESTE SPACY
354 FALLOWING FALLOWE SPACY
355 HYDROMECHANICS HYDROMECHANIC SPACY
356 CLIVERS CLIVER SPACY
357 UNICES UNICE SPACY
358 GRAMMATICS GRAMMATIC SPACY
359 PRAPS PRAP SPACY
360 INTERWORKING INTERWORKE SPACY
361 HERCULES HERCULE SPACY
362 BIGHEADED BIGHEADE SPACY
363 KIES KY SPACY
364 NETHERLANDS NETHERLAND SPACY
365 UNBOOKED UNBOOKE SPACY
366 QUINS QUIN SPACY
367 CANNES CANNE SPACY
368 UNNURTURED UNNURTURE SPACY
369 WEDGIES WEDGIE SPACY
370 HANDWORKED HANDWORKE SPACY
371 ANALECTS ANALECT SPACY
372 HERTS HERT SPACY
373 ORLEANS ORLEAN SPACY
374 PESCADORES PESCADORE SPACY
375 ULCERED ULCERE SPACY
376 MISCREATED MISCREATE SPACY
377 UNPRIZED UNPRIZE SPACY
378 SLYBOOTS SLYBOOT SPACY
379 RUNTED RUNTE SPACY
380 REATTRIBUTED REATTRIBUTE SPACY
381 HOUSETRAINED HOUSETRAINE SPACY
382 SOBERSIDES SOBERSIDE SPACY
383 COLESEED COLESEE SPACY
384 BLUCHERS BLUCHER SPACY
385 MUGGINS MUGGIN SPACY
386 UNCRIPPLED UNCRIPPLE SPACY
387 HEPPED HEPPE SPACY
388 WITHINDOORS WITHINDOOR SPACY
389 BEESTINGS BEESTING SPACY
390 FLANDERS FLANDER SPACY
391 DIOGENES DIOGENE SPACY
392 COSMONAUTICS COSMONAUTIC SPACY
393 WHOLEGRAINS WHOLEGRAIN SPACY
394 NEEDMENTS NEEDMENT SPACY
395 ACHATES ACHATE SPACY
396 PRECOMPILING PRECOMPILE SPACY
397 BALUSTERED BALUSTERE SPACY
398 JUGGINS JUGGIN SPACY
399 UNCONFIGURED UNCONFIGURE SPACY
400 SLUGABED SLUGABE SPACY
401 CHARGRILLED CHARGRILLE SPACY
402 GANGES GANGE SPACY
403 FLATWAYS FLATWAY SPACY
404 CHAMPERS CHAMPER SPACY
405 GOLDILOCKS GOLDILOCK SPACY
406 REIMS REIM SPACY
407 REIMPORTING REIMPORTE SPACY
408 EMOTIONED EMOTIONE SPACY
409 AIRBED AIRBE SPACY
410 GIGAFLOPS GIGAFLOP SPACY
411 YONKS YONK SPACY
412 CASALS CASAL SPACY
413 ROCKIES ROCKIE SPACY
414 ORESTES ORESTE SPACY
415 REMAPPING REMAPPE SPACY
416 EBONICS EBONIC SPACY
417 BRUGES BRUGE SPACY
418 JANKERS JANKER SPACY
419 NOTTS NOTT SPACY
420 PROCRUSTES PROCRUSTE SPACY
421 MULTISCALED MULTISCALE SPACY
422 AGROTECHNICS AGROTECHNIC SPACY
423 WAYGOING WAYGOE SPACY
424 GENDERING GENDERE SPACY
425 TELEMECHANICS TELEMECHANIC SPACY
426 DEGATING DEGATE SPACY
427 THAMES THAME SPACY
428 LOWLIVED LOWLIVE SPACY
429 REEDING REEDE SPACY
430 INTERCROSSING INTERCROSSE SPACY
431 UNDEDUCTED UNDEDUCTE SPACY
432 AGOGICS AGOGIC SPACY
433 UNATTENDING UNATTENDE SPACY
434 OVERMASTED OVERMASTE SPACY
435 GILES GILE SPACY
436 NONCOPYRIGHTED NONCOPYRIGHTE SPACY
437 LUDDITES LUDDITE SPACY
438 SCURVIED SCURVIE SPACY
439 REBREAKING REBREAKE SPACY
440 KEATS KEAT SPACY
441 CERVANTES CERVANTE SPACY
442 UNCONDONED UNCONDONE SPACY
443 DESCARTES DESCARTE SPACY
444 BEJABERS BEJABER SPACY
445 VIDEOGRAPHICS VIDEOGRAPHIC SPACY
446 EURIPIDES EURIPIDE SPACY
447 UNPERJURED UNPERJURE SPACY
448 LAERTES LAERTE SPACY
449 OVERCOLLECTED OVERCOLLECTE SPACY
450 AMPHIBRACHYS AMPHIBRACHY SPACY
451 CHEOPS CHEOP SPACY
452 CHALONS CHALON SPACY
453 VERSICOLOURED VERSICOLOURE SPACY
454 SUBPARTITIONED SUBPARTITIONE SPACY
455 BALBUTIES BALBUTIE SPACY
456 ARCHIMEDES ARCHIMEDE SPACY
457 GATELEGGED GATELEGGE SPACY
458 POITIERS POITIER SPACY
459 HAVERING HAVERE SPACY
460 THEBES THEBE SPACY
461 SEVRES SEVRE SPACY
462 PERICLES PERICLE SPACY
463 LIMOGES LIMOGE SPACY
464 EVENTING EVENTE SPACY
465 FATBITS FATBIT SPACY
466 HUTTING HUTTE SPACY
467 DOGSHORES DOGSHORE SPACY
468 OVERBADING OVERBADE SPACY
469 AZORES AZORE SPACY
470 BLEWITS BLEWIT SPACY
471 HIPOCRATES HIPOCRATE SPACY
472 AMIENS AMIEN SPACY
473 GUTTING GUTTE SPACY
474 GLADYS GLADY SPACY
475 CHADDED CHADDE SPACY
476 EUPHRATES EUPHRATE SPACY
477 TROWING TROWE SPACY
478 LACEUPS LACEUP SPACY
479 ALIPED ALIPE SPACY
480 TALIPED TALIPE SPACY
481 RAMSES RAMSE SPACY
482 CENTRONICS CENTRONIC SPACY
483 BANTING BANTE SPACY
484 TELEPHOTOLENS TELEPHOTOLEN SPACY
485 ARAKS ARAK SPACY
486 DONETS DONET SPACY
487 CEROPLASTICS CEROPLASTIC SPACY
488 BAYNETWORKS BAYNETWORK SPACY
489 NORWARDS NORWARD SPACY
490 HAPPING HAPPE SPACY
491 BARENTS BARENT SPACY
492 ABLINGS ABLING SPACY
493 CELLING CELLE SPACY
494 CELEBES CELEBE SPACY
495 NENETS NENET SPACY
496 IMPING IMPE SPACY
497 LINARES LINARE SPACY
498 VAILING VAILE SPACY
499 HABDABS HABDAB SPACY
500 RELISTING RELISTE SPACY
501 HOUGHING HOUGHE SPACY

View File

@ -1,122 +0,0 @@
word,lemmatized_word
the,THE
of,OF
to,TO
in,IN
is,BE
that,THAT
for,FOR
be,BE
by,BY
with,WITH
on,ON
not,NOT
this,THIS
are,BE
at,AT
from,FROM
he,HE
which,WHICH
his,HIS
have,HAVE
an,AN
but,BUT
you,YOU
they,THEY
were,BE
had,HAVE
we,WE
all,ALL
one,ONE
their,THEIR
been,BE
will,WILL
there,THERE
can,CAN
if,IF
other,OTHER
would,WOULD
no,NO
her,SHE
may,MAY
more,MORE
when,WHEN
who,WHO
such,SUCH
these,THESE
any,ANY
she,SHE
new,NEW
time,TIME
than,THAN
do,DO
some,SOME
what,WHAT
only,ONLY
into,INTO
them,THEY
two,TWO
also,ALSO
about,ABOUT
out,OUT
him,HE
my,MY
said,SAY
up,UP
our,OUR
first,FIRST
should,SHOULD
under,UNDER
made,MAKE
state,STATE
see,SEE
after,AFTER
could,COULD
then,THEN
me,I
most,MOST
over,OVER
very,VERY
your,YOUR
between,BETWEEN
where,WHERE
now,NOW
shall,SHALL
work,WORK
those,THOSE
same,SAME
well,WELL
each,EACH
many,MANY
being,BE
years,YEAR
did,DO
year,YEAR
through,THROUGH
must,MUST
upon,UPON
before,BEFORE
like,LIKE
use,USE
part,PART
general,GENERAL
people,PEOPLE
because,BECAUSE
used,USE
how,HOW
even,EVEN
much,MUCH
states,STATE
during,DURING
both,BOTH
case,CASE
three,THREE
number,NUMBER
make,MAKE
per,PER
great,GREAT
act,ACT
way,WAY
life,LIFE
good,GOOD
day,DAY
1 word lemmatized_word
2 the THE
3 of OF
4 to TO
5 in IN
6 is BE
7 that THAT
8 for FOR
9 be BE
10 by BY
11 with WITH
12 on ON
13 not NOT
14 this THIS
15 are BE
16 at AT
17 from FROM
18 he HE
19 which WHICH
20 his HIS
21 have HAVE
22 an AN
23 but BUT
24 you YOU
25 they THEY
26 were BE
27 had HAVE
28 we WE
29 all ALL
30 one ONE
31 their THEIR
32 been BE
33 will WILL
34 there THERE
35 can CAN
36 if IF
37 other OTHER
38 would WOULD
39 no NO
40 her SHE
41 may MAY
42 more MORE
43 when WHEN
44 who WHO
45 such SUCH
46 these THESE
47 any ANY
48 she SHE
49 new NEW
50 time TIME
51 than THAN
52 do DO
53 some SOME
54 what WHAT
55 only ONLY
56 into INTO
57 them THEY
58 two TWO
59 also ALSO
60 about ABOUT
61 out OUT
62 him HE
63 my MY
64 said SAY
65 up UP
66 our OUR
67 first FIRST
68 should SHOULD
69 under UNDER
70 made MAKE
71 state STATE
72 see SEE
73 after AFTER
74 could COULD
75 then THEN
76 me I
77 most MOST
78 over OVER
79 very VERY
80 your YOUR
81 between BETWEEN
82 where WHERE
83 now NOW
84 shall SHALL
85 work WORK
86 those THOSE
87 same SAME
88 well WELL
89 each EACH
90 many MANY
91 being BE
92 years YEAR
93 did DO
94 year YEAR
95 through THROUGH
96 must MUST
97 upon UPON
98 before BEFORE
99 like LIKE
100 use USE
101 part PART
102 general GENERAL
103 people PEOPLE
104 because BECAUSE
105 used USE
106 how HOW
107 even EVEN
108 much MUCH
109 states STATE
110 during DURING
111 both BOTH
112 case CASE
113 three THREE
114 number NUMBER
115 make MAKE
116 per PER
117 great GREAT
118 act ACT
119 way WAY
120 life LIFE
121 good GOOD
122 day DAY

Binary file not shown.

View File

@ -1,103 +1,70 @@
#!/usr/bin/env python3
# coding: utf-8
print("Step 1")
print("Loading dependencies")
try:
_initialized
except:
# !pip install spacy
# !python -m spacy download en_core_web_trf
import spacy
from tqdm import tqdm
nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])
_initialized=True
import pandas as pd
import spacy
import nltk
from tqdm import tqdm
import gzip
import re
# Wordnet
try:
from nltk.stem.wordnet import WordNetLemmatizer
except:
nltk.download("wordnet")
from nltk.stem.wordnet import WordNetLemmatizer
wordnet = WordNetLemmatizer()
print("Step 2")
# Spacy
nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])
print("Loading initial wordlist")
def get_lines(filename):
with gzip.open(filename, 'r') as f:
ret = []
for l in f:
if len(ret) > 30_000:
return ret
ret.append(str(l).lower())
return ret
words = []
with gzip.open("./00-frequency-list.csv.gz", 'r') as infile:
for line in infile:
words.append(line.decode('ascii').split(",")[0])
# Remove header
words = words[1:]
print(words[0:5])
WORDLIST_SIZE = 8192 + 3
word_re = re.compile(r"^[A-Za-z]+$")
print("Lemmatizing words")
seen_lemmatizations = set()
print("Step 3")
with open("./01-errored-lemmatized-words.csv", 'w') as erroutfile:
erroutfile.write("WORD,ATTEMPTED_LEMMATIZATION,LEMMATIZER\n")
with gzip.open("./01-lemmatized-words.csv.gz", 'w') as outfile:
outfile.write("WORD,LEMMATIZED_WORD,LEMMATIZER\n".encode("ascii"))
annotated_words=pd.read_excel("annotated_words.ods")
iter = tqdm(words)
excluded_words = list(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())
excluded_words[0:10]
for word in iter:
lemmatized_words = [
# Wordnet
(wordnet.lemmatize(word).upper(), 'WORDNET'),
# Spacy
(nlp(word)[0].lemma_.upper().upper(), 'SPACY'),
]
custom_maps = annotated_words[annotated_words["maps_to"].notna()][["word","maps_to"]].assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))
custom_maps = [
(m[1]["word"].lower(), mapping.lower())
for m in custom_maps.iterrows()
for mapping in m[1]["maps_to"]
]
custom_maps
print("Step 4")
# Start parsing the wordlist
all_words = get_lines("00-frequency-all.txt.gz")
# Delete header line
all_words = all_words[1:]
# Get only the word (fixed width)
all_words = [w[13:36].strip() for w in all_words]
# Remove special characters
all_words = [w for w in all_words if word_re.search(w)]
# Remove all removed words
all_words = [w for w in all_words if w not in excluded_words]
# Add all custom mappings
for m in list(sum(custom_maps, ())):
if m[0] not in all_words:
all_words.append(m[0])
if m[1] not in all_words:
all_words.append(m[1])
print("Step 5")
# Lemmatize all words (plural -> singular)
# lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
with open("01-lemmatized-words.csv", "w") as f:
f.write("word,lemmatized_word\n")
iter = tqdm(all_words[:1000])
for w in iter:
lemmatized_word = nlp(w)[0].lemma_.upper()
if lemmatized_word == w:
for (lemmatized_word, lemmatizer) in lemmatized_words:
if word == lemmatized_word:
continue
if lemmatized_word not in all_words:
iter.write(f"{lemmatized_word} not in all_words")
f.write(f"{w},{lemmatized_word}\n")
if (word, lemmatized_word) in seen_lemmatizations:
continue
seen_lemmatizations.add((word, lemmatized_word))
if lemmatized_word not in words:
iter.write(f"{lemmatized_word} ({lemmatizer}) not in all_words")
erroutfile.write(f"{word},{lemmatized_word},{lemmatizer}\n")
continue
iter.write(f"{word} => {lemmatized_word} ({lemmatizer}) added")
outfile.write(f"{word},{lemmatized_word},{lemmatizer}\n".encode("ascii"))

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,103 @@
ADD,ADDS
ADS,ADDS
AFFECTED,EFFECT
AFFECT,EFFECT
AFFECTIONS,AFFECTION
AFFECTIVE,EFFECT
AFFECTS,EFFECT
ALUMINIUM,ALUMINUM
ALUMINUM,ALUMINIUM
ANALYSE,ANALYZE
ANALYSED,ANALYZE
ANALYSES,ANALYZE
AUX,OX
BE,BEE
BERRY,BARRY
BLEW,BLUE
BOT,BOUGHT
BOULDER,BOLDER
BRINGS,BRING
BY,BYE
CAPITOL,CAPITAL
CENTS,SENSE
CHILE,CHILI
CHILE,CHILLY
COLOURLESS,COLORLESS
COM,CALM
CORP,CORE
CORPS,CORE
CUE,QUEUE
DAZE,DAY
DAZED,DAY
DEAR,DEER
DESSERT,DESERT
DEW,DO
DEW,DUE
DIED,DYED
EFFECTIVE,EFFECT
EFFECTS,EFFECT
ELECTRONICS,ELECTRONIC
FAVOUR,FAVOR
FAX,FACTS
FILING,FILLING
FILINGS,FILLING
FORTUNATELY,FORTUNATE
FOUR,FOR
GRATE,GREAT
HAIRY,HARRY
HARRY,HAIRY
HEIR,HAIR
HEIRS,HAIR
HEM,HIM
HONOUR,HONOR
HONOURS,HONORS
HYMN,HIM
HYMNS,HIM
IMPROVES,IMPROVE
ISLE,AISLE
KNIGHT,NIGHT
KNOT,NOT
KNOTS,NOT
LARVAE,LARVA
LECTURER,LECTURE
MANOR,MANNER
MONIES,MONEYS
NEIGHBOURHOOD,NEIGHBORHOOD
NEIGHBOUR,NEIGHBOR
NEIGHBOURS,NEIGHBOR
NOSE,KNOW
NUN,NONE
ORE,OAR
ORE,OR
ORGANISATIONAL,ORGANIZATIONAL
ORGANISATION,ORGANIZATION
ORGANISATIONS,ORGANIZATION
OWE,OH
PAR,PARSE
PARS,PARSE
PEOPLES,PEOPLE
PER,PURR
PETAL,PEDAL
PROVIDES,PROVIDE
RAP,WRAP
REFORMED,REFORM
SCENT,CENT
SCENTS,CENT
SENSE,CENT
SENSED,CENT
SENSES,CENT
SIMULTANEOUSLY,SIMULTANEOUS
TELECOMMUNICATIONS,TELECOMMUNICATION
THEATRES,THEATER
THEATRE,THEATER
THRU,THROUGH
VAPOUR,VAPOR
VARY,VERY
VERTEBRA,VERTEBRAE
WEARY,WARY
WEIGHS,WAY
WEIGH,WAY
YELLOW,HELLO
CACHE,CASH
BYTE,BITE
COUNSELLOR,COUNSELOR
1 ADD ADDS
2 ADS ADDS
3 AFFECTED EFFECT
4 AFFECT EFFECT
5 AFFECTIONS AFFECTION
6 AFFECTIVE EFFECT
7 AFFECTS EFFECT
8 ALUMINIUM ALUMINUM
9 ALUMINUM ALUMINIUM
10 ANALYSE ANALYZE
11 ANALYSED ANALYZE
12 ANALYSES ANALYZE
13 AUX OX
14 BE BEE
15 BERRY BARRY
16 BLEW BLUE
17 BOT BOUGHT
18 BOULDER BOLDER
19 BRINGS BRING
20 BY BYE
21 CAPITOL CAPITAL
22 CENTS SENSE
23 CHILE CHILI
24 CHILE CHILLY
25 COLOURLESS COLORLESS
26 COM CALM
27 CORP CORE
28 CORPS CORE
29 CUE QUEUE
30 DAZE DAY
31 DAZED DAY
32 DEAR DEER
33 DESSERT DESERT
34 DEW DO
35 DEW DUE
36 DIED DYED
37 EFFECTIVE EFFECT
38 EFFECTS EFFECT
39 ELECTRONICS ELECTRONIC
40 FAVOUR FAVOR
41 FAX FACTS
42 FILING FILLING
43 FILINGS FILLING
44 FORTUNATELY FORTUNATE
45 FOUR FOR
46 GRATE GREAT
47 HAIRY HARRY
48 HARRY HAIRY
49 HEIR HAIR
50 HEIRS HAIR
51 HEM HIM
52 HONOUR HONOR
53 HONOURS HONORS
54 HYMN HIM
55 HYMNS HIM
56 IMPROVES IMPROVE
57 ISLE AISLE
58 KNIGHT NIGHT
59 KNOT NOT
60 KNOTS NOT
61 LARVAE LARVA
62 LECTURER LECTURE
63 MANOR MANNER
64 MONIES MONEYS
65 NEIGHBOURHOOD NEIGHBORHOOD
66 NEIGHBOUR NEIGHBOR
67 NEIGHBOURS NEIGHBOR
68 NOSE KNOW
69 NUN NONE
70 ORE OAR
71 ORE OR
72 ORGANISATIONAL ORGANIZATIONAL
73 ORGANISATION ORGANIZATION
74 ORGANISATIONS ORGANIZATION
75 OWE OH
76 PAR PARSE
77 PARS PARSE
78 PEOPLES PEOPLE
79 PER PURR
80 PETAL PEDAL
81 PROVIDES PROVIDE
82 RAP WRAP
83 REFORMED REFORM
84 SCENT CENT
85 SCENTS CENT
86 SENSE CENT
87 SENSED CENT
88 SENSES CENT
89 SIMULTANEOUSLY SIMULTANEOUS
90 TELECOMMUNICATIONS TELECOMMUNICATION
91 THEATRES THEATER
92 THEATRE THEATER
93 THRU THROUGH
94 VAPOUR VAPOR
95 VARY VERY
96 VERTEBRA VERTEBRAE
97 WEARY WARY
98 WEIGHS WAY
99 WEIGH WAY
100 YELLOW HELLO
101 CACHE CASH
102 BYTE BITE
103 COUNSELLOR COUNSELOR

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

126
wordlist/04-deduplicated-words.py Executable file
View File

@ -0,0 +1,126 @@
#!/usr/bin/env python3
# coding: utf-8
import gzip
from pprint import pprint
from tqdm import tqdm
# 2**13 + 2 since two can be skipped
WORDLIST_SIZE = 8192 + 2

print("Loading full wordlist")
# The file is opened in binary mode, so each row is decoded explicitly.
with gzip.open("./00-frequency-list.csv.gz", "r") as infile:
    # Discard the header row, then keep only the first comma-separated column.
    next(infile, None)
    all_words = [row.decode("ascii").split(",")[0] for row in infile]
print("Building lemmatization graph")
# Each element is a set of words that are considered the same word.
# The sets are kept pairwise disjoint by add_lemmatization().
lemmatization_graph = []


def add_lemmatization(word1, word2):
    """Record that ``word1`` and ``word2`` belong to the same word group.

    If neither word is known yet, a new two-word group is appended to
    ``lemmatization_graph``.  If one or both are known, both words are added
    to the first group that mentions either of them.  When the two words were
    previously in *different* groups, those groups are merged into one so that
    a word never ends up in two groups at once.

    Args:
        word1: First word of the pair.
        word2: Second word of the pair.

    Returns:
        None. ``lemmatization_graph`` is updated in place.
    """
    matching = [
        group
        for group in lemmatization_graph
        if word1 in group or word2 in group
    ]

    if not matching:
        # There is no known lemmatization between these two. Add it
        lemmatization_graph.append({word1, word2})
        return

    target = matching[0]
    if word1 in target and word2 in target:
        print(f"Warning: lemmatization {word1}<=>{word2} already in set: {target}")
    target.update((word1, word2))

    absorbed = matching[1:]
    if absorbed:
        # The pair bridges groups that used to be separate: fold them into
        # the first one and drop the now-redundant sets (compared by
        # identity so an equal-looking but distinct set is never removed).
        for group in absorbed:
            target.update(group)
        lemmatization_graph[:] = [
            group
            for group in lemmatization_graph
            if not any(group is gone for gone in absorbed)
        ]
def get_lemmatization(word):
    """Return the first group in ``lemmatization_graph`` containing ``word``.

    Returns ``None`` when no group mentions the word.
    """
    return next(
        (group for group in lemmatization_graph if word in group),
        None,
    )
print("\tAdding automatic lemmatizations")
# First, iterate over automated lemmatizations
with gzip.open("./01-lemmatized-words.csv.gz") as infile:
    # The first row is a header and is not a lemmatization.
    next(infile, None)
    for raw_row in infile:
        fields = raw_row.decode("ascii").strip().split(",")
        # Only the first two columns are used; any further columns are ignored.
        add_lemmatization(fields[0], fields[1])
print("\tAdding custom lemmatizations")
# Next, iterate over manual lemmatizations
with open("./02-custom-lemmatizations.csv") as infile:
    # The first row is skipped unconditionally (treated as a header).
    next(infile, None)
    for row in infile:
        fields = row.strip().split(",")
        add_lemmatization(fields[0], fields[1])

print("Lemmatization graph constructed:")
pprint(lemmatization_graph)
print("Loading exclude wordlist")
with open("./03-exclude.csv") as infile:
    # Skip the first row, then collect every remaining line (whitespace
    # stripped) as an excluded word.
    next(infile, None)
    exclude_words = {row.strip() for row in infile}
# Now, start collecting the first WORDLIST_SIZE word groups, walking
# all_words in file order.
# Groups already emitted, stored as frozensets because plain sets are not
# hashable and therefore cannot be members of a set.
seen_word_lemmatizations = set()
final_wordlist = []
ending_word_index = 0
for word in all_words:
    ending_word_index += 1

    word_lemmatizations = get_lemmatization(word)
    if not word_lemmatizations:
        # A word with no known lemmatization forms a group of its own.
        word_lemmatizations = {word}

    # A group is dropped as a whole if any one of its members is excluded.
    if not word_lemmatizations.isdisjoint(exclude_words):
        print(f"Note: {word_lemmatizations} is excluded")
        continue

    group_key = frozenset(word_lemmatizations)
    if group_key in seen_word_lemmatizations:
        # We already added this one
        continue
    # Remember the group so that its other member words, which appear later
    # in all_words, do not add the same group again under a new number.
    seen_word_lemmatizations.add(group_key)

    final_wordlist.append(word_lemmatizations)

    if len(final_wordlist) >= WORDLIST_SIZE:
        # We've added all the words we need
        break

assert len(final_wordlist) == WORDLIST_SIZE, (
    f"only {len(final_wordlist)} word groups available, need {WORDLIST_SIZE}"
)
pprint(list(enumerate(final_wordlist)))
print(f"Ending index: {ending_word_index}")
# Flatten the groups into (number, word) pairs; numbering starts at 1 and
# every word of a group shares that group's number.
numbered_words = []
for number, group in enumerate(final_wordlist, start=1):
    numbered_words.extend((number, member) for member in group)
final_wordlist = numbered_words

with open("./04-deduplicated-words.csv", "w") as outfile:
    outfile.write("WORD,NUMBER\n")
    outfile.writelines(f"{member},{number}\n" for number, member in final_wordlist)
# all_words.append(line.decode('ascii').split(",")[0])

Binary file not shown.

View File

@ -6,6 +6,7 @@ asttokens==2.2.1
attrs==22.2.0
backcall==0.2.0
beautifulsoup4==4.11.2
black==23.1.0
bleach==6.0.0
blis==0.7.9
catalogue==2.0.8
@ -33,6 +34,7 @@ ipywidgets==8.0.4
isoduration==20.11.0
jedi==0.18.2
Jinja2==3.1.2
joblib==1.2.0
jsonpointer==2.3
jsonschema==4.17.3
jupyter==1.0.0
@ -49,11 +51,13 @@ MarkupSafe==2.1.2
matplotlib-inline==0.1.6
mistune==2.0.5
murmurhash==1.0.9
mypy-extensions==1.0.0
nbclassic==0.5.2
nbclient==0.7.2
nbconvert==7.2.9
nbformat==5.7.3
nest-asyncio==1.5.6
nltk==3.8.1
notebook==6.5.2
notebook_shim==0.2.2
numpy==1.24.2
@ -66,6 +70,7 @@ packaging==23.0
pandas==1.5.3
pandocfilters==1.5.0
parso==0.8.3
pathspec==0.11.0
pathy==0.10.1
pexpect==4.8.0
pickleshare==0.7.5

File diff suppressed because it is too large Load Diff

View File

@ -1,220 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "991a711f-be98-4aae-a657-84b065449916",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"try:\n",
" _initialized\n",
"except:\n",
" # !pip install spacy\n",
" # !python -m spacy download en_core_web_trf\n",
" import spacy\n",
" \n",
" nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n",
" \n",
" _initialized=True\n",
" \n",
"import pandas as pd\n",
"import gzip\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "d130bb84",
"metadata": {},
"outputs": [],
"source": [
"def get_lines(filename):\n",
" with gzip.open(filename, 'r') as f:\n",
" ret = []\n",
" for l in f:\n",
" if len(ret) > 30_000:\n",
" return ret\n",
" ret.append(str(l).lower())\n",
" return ret\n",
"\n",
"\n",
" \n",
"WORDLIST_SIZE = 8192 + 3\n",
"word_re = re.compile(r\"^[A-Za-z]+$\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "de2d1731",
"metadata": {},
"outputs": [],
"source": [
"!pwd\n",
"!ls"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "90665714",
"metadata": {},
"outputs": [],
"source": [
"annotated_words=pd.read_excel(\"annotated_words.ods\")\n",
"\n",
"excluded_words = list(annotated_words[annotated_words[\"keep\"] != \"Yes\"][\"word\"].str.lower())\n",
"excluded_words[0:10]\n",
"\n",
"custom_maps = annotated_words[annotated_words[\"maps_to\"].notna()][[\"word\",\"maps_to\"]].assign(maps_to=lambda x: x[\"maps_to\"].map(lambda y: y.split(\",\")))\n",
"\n",
"custom_maps = [\n",
" (m[1][\"word\"].lower(), mapping.lower())\n",
" for m in custom_maps.iterrows()\n",
" for mapping in m[1][\"maps_to\"]\n",
"]\n",
"custom_maps"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "fb50c69e",
"metadata": {},
"outputs": [],
"source": [
"# Start parsing the wordlist\n",
"all_words = get_lines(\"00-frequency-all.txt.gz\")\n",
"\n",
"# Delete header line\n",
"all_words = all_words[1:]\n",
"\n",
"# Get only the word (fixed width)\n",
"all_words = [w[13:36].strip() for w in all_words]\n",
"\n",
"# Remove special characters\n",
"all_words = [w for w in all_words if word_re.search(w)]\n",
"\n",
"# Remove all removed words\n",
"all_words = [w for w in all_words if w not in excluded_words]\n",
"\n",
"# Add all custom mappings\n",
"for m in list(sum(custom_maps, ())):\n",
" if m[0] not in all_words:\n",
" all_words.append(m[0])\n",
" if m[1] not in all_words:\n",
" all_words.append(m[1])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "cd21bff5",
"metadata": {},
"outputs": [],
"source": [
"# Lemmatize all words (plural -> singular)\n",
"lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in all_words[:100]]\n",
"print(lemmatize_mappings[:100])\n",
"\n",
"# Add custom lemmatizations\n",
"for l in custom_maps:\n",
" if l in lemmatize_mappings:\n",
" print(f\"Warning: {l} is already lemmatized\")\n",
" else:\n",
" lemmatize_mappings.append(l)\n",
" \n",
"print(lemmatize_mappings[:100])\n",
"\n",
"lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_words]\n",
"print(lemmatize_mappings[:100])\n",
"\n",
"# Now, re-add all lematized words to the list of every word\n",
"for w in sum(lemmatize_mappings, ()):\n",
" if w not in all_words:\n",
" print(w)\n",
" all_words.append(w)\n",
" \n",
"lemmatize_mappings = {k: v for k, v in lemmatize_mappings}"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0ee9af7d",
"metadata": {},
"outputs": [],
"source": [
"final_wordlist = []\n",
"seen_lemmatizations = set()\n",
"for w in all_words:\n",
" lemmatized = lemmatize_mappings.get(w) or w\n",
" if lemmatized in seen_lemmatizations:\n",
" # The lemmatized version of this word was already seen\n",
" continue\n",
" else:\n",
" # The lemmatized version hasn't been seen. We're good to add it\n",
" final_wordlist.append([\n",
" k\n",
" for k\n",
" in lemmatize_mappings.keys()\n",
" if lemmatize_mappings[k] == lemmatized\n",
" ])\n",
" seen_lemmatizations.add(lemmatized)\n",
"\n",
" if len(final_wordlist) >= WORDLIST_SIZE:\n",
" break\n",
"\n",
"# Now, convert it to the format (number, word)\n",
"final_wordlist = [\n",
" (idx, w)\n",
" for idx, words in enumerate(final_wordlist)\n",
" for w in words\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "07c1293c",
"metadata": {},
"outputs": [],
"source": [
"print(len(lemmatize_mappings))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "19c255d0",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -1,159 +0,0 @@
#!/usr/bin/env python3
# coding: utf-8
print("Step 1")
# Load the spaCy pipeline only once: when this code is re-run in a session
# where `_initialized` already exists, the model load is skipped.
try:
    _initialized
except NameError:
    # Only the "name not defined yet" case means first run; any other
    # exception should not be swallowed here.
    # !pip install spacy
    # !python -m spacy download en_core_web_trf
    import spacy
    from tqdm import tqdm

    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])

    _initialized = True
import pandas as pd
import gzip
import re
print("Step 2")


def get_lines(filename):
    """Return the leading lines of a gzip file as lower-cased strings.

    The file is opened in binary mode and each line is passed through
    ``str()``, so every element is the lower-cased textual form of a bytes
    object (for example ``b'word\\n'``), not decoded text.  At most 30,001
    lines are returned.
    """
    with gzip.open(filename, 'r') as f:
        # zip() stops as soon as the range is exhausted, which caps the result.
        return [str(raw).lower() for _, raw in zip(range(30_001), f)]


WORDLIST_SIZE = 8192 + 3
word_re = re.compile(r"^[A-Za-z]+$")
print("Step 3")
annotated_words = pd.read_excel("annotated_words.ods")

# Every row whose "keep" column is not exactly "Yes" is excluded (lower-cased).
rejected_rows = annotated_words[annotated_words["keep"] != "Yes"]
excluded_words = rejected_rows["word"].str.lower().tolist()

# "maps_to" holds a comma-separated list of targets; emit one lower-cased
# (word, target) pair per target.
mapped_rows = annotated_words[annotated_words["maps_to"].notna()]
custom_maps = [
    (source.lower(), target.lower())
    for source, targets in zip(mapped_rows["word"], mapped_rows["maps_to"])
    for target in targets.split(",")
]
print("Step 4")
# Start parsing the wordlist
all_words = get_lines("00-frequency-all.txt.gz")

# Delete header line
all_words = all_words[1:]

# Get only the word (fixed width)
all_words = [w[13:36].strip() for w in all_words]

# Remove special characters
all_words = [w for w in all_words if word_re.search(w)]

# Remove all removed words
all_words = [w for w in all_words if w not in excluded_words]

# Add all custom mappings.
# Iterate over the (word, mapping) pairs themselves: flattening them with
# sum(custom_maps, ()) yields individual strings, and indexing a string with
# [0] / [1] would add single characters instead of the words.
known_words = set(all_words)
for pair in custom_maps:
    for mapped_word in pair:
        if mapped_word not in known_words:
            all_words.append(mapped_word)
            known_words.add(mapped_word)
print("Step 5")
# Lemmatize all words (plural -> singular)
lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
print(lemmatize_mappings[:100])

# Add custom lemmatizations.
# A set mirror of the list keeps the "already present" test constant-time
# while the list itself preserves insertion order.
known_pairs = set(lemmatize_mappings)
for l in custom_maps:
    if l in known_pairs:
        print(f"Warning: {l} is already lemmatized")
    else:
        lemmatize_mappings.append(l)
        known_pairs.add(l)
print(lemmatize_mappings[:100])

# Drop every mapping whose target is an excluded word.
excluded_lookup = set(excluded_words)
lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_lookup]
print(lemmatize_mappings[:100])

# Now, re-add all lemmatized words to the list of every word.
# Walk the pairs directly rather than concatenating them with sum(..., ()),
# which copies an ever-growing tuple on each step.
known_words = set(all_words)
for pair in lemmatize_mappings:
    for w in pair:
        if w not in known_words:
            print(w)
            all_words.append(w)
            known_words.add(w)

# Later pairs override earlier ones when a word appears more than once.
lemmatize_mappings = {k: v for k, v in lemmatize_mappings}
print("Step 6")
# Invert the word -> lemma mapping once so that each lemma's words can be
# looked up directly instead of rescanning the whole mapping per lemma.
# Words keep the mapping's iteration order within each group.
words_by_lemma = {}
for k, v in lemmatize_mappings.items():
    words_by_lemma.setdefault(v, []).append(k)

final_wordlist = []
seen_lemmatizations = set()
for w in all_words:
    # Words without a (non-empty) lemma stand for themselves.
    lemmatized = lemmatize_mappings.get(w) or w
    if lemmatized in seen_lemmatizations:
        # The lemmatized version of this word was already seen
        continue

    # The lemmatized version hasn't been seen. We're good to add it.
    # The group may be empty when no mapping points at this lemma.
    final_wordlist.append(words_by_lemma.get(lemmatized, []))
    seen_lemmatizations.add(lemmatized)

    if len(final_wordlist) >= WORDLIST_SIZE:
        break

# Now, convert it to the format (number, word)
final_wordlist = [
    (idx, w)
    for idx, words in enumerate(final_wordlist)
    for w in words
]
print("Step 7")
print(len(lemmatize_mappings))

print("Step 8")
# Each final_wordlist entry is a (number, word) pair; the word is written
# upper-cased, one "WORD,number" row per entry.
with open("01-generated-wordlist.csv", "w") as f:
    f.write("word,number\n")
    for number, word in final_wordlist:
        f.write(f"{word.upper()},{number}\n")

print("Done")