Change fasttext-wheel to fasttext in requirements.txt #470
Change fasttext-wheel to fasttext in requirements.txt #470 — SunnyHaze merged 2 commits into OpenDCAI:main from
Conversation
|
Here's the diff between the two dependency versions, in case you want to review it: diff -u -r fasttext-wheel-0.9.2/PKG-INFO fasttext-0.9.3/PKG-INFO
--- fasttext-wheel-0.9.2/PKG-INFO 2023-05-05 07:58:47.102100400 -0400
+++ fasttext-0.9.3/PKG-INFO 2024-06-12 05:28:01.705583800 -0400
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
-Name: fasttext-wheel
-Version: 0.9.2
+Name: fasttext
+Version: 0.9.3
Summary: fasttext Python bindings
Home-page: https://github.com/facebookresearch/fastText
Author: Onur Celebi
@@ -21,6 +21,9 @@
Classifier: Operating System :: Unix
Classifier: Operating System :: MacOS
License-File: LICENSE
+Requires-Dist: pybind11>=2.2
+Requires-Dist: setuptools>=0.7.0
+Requires-Dist: numpy
fastText |CircleCI|
===================
diff -u -r fasttext-wheel-0.9.2/python/fasttext_module/fasttext/FastText.py fasttext-0.9.3/python/fasttext_module/fasttext/FastText.py
--- fasttext-wheel-0.9.2/python/fasttext_module/fasttext/FastText.py 2023-05-05 07:58:14.000000000 -0400
+++ fasttext-0.9.3/python/fasttext_module/fasttext/FastText.py 2024-06-12 04:55:55.000000000 -0400
@@ -12,7 +12,6 @@
import fasttext_pybind as fasttext
import numpy as np
import multiprocessing
-import sys
from itertools import chain
loss_name = fasttext.loss_name
@@ -24,11 +23,7 @@
displayed_errors = {}
-def eprint(*args, **kwargs):
- print(*args, file=sys.stderr, **kwargs)
-
-
-class _Meter(object):
+class _Meter:
def __init__(self, fasttext_model, meter):
self.f = fasttext_model
self.m = meter
@@ -81,7 +76,7 @@
return recall
-class _FastText(object):
+class _FastText:
"""
This class defines the API to inspect models and should not be used to
create objects. It will be returned by functions such as load_model or
@@ -102,10 +97,26 @@
def set_args(self, args=None):
if args:
- arg_names = ['lr', 'dim', 'ws', 'epoch', 'minCount',
- 'minCountLabel', 'minn', 'maxn', 'neg', 'wordNgrams',
- 'loss', 'bucket', 'thread', 'lrUpdateRate', 't',
- 'label', 'verbose', 'pretrainedVectors']
+ arg_names = [
+ "lr",
+ "dim",
+ "ws",
+ "epoch",
+ "minCount",
+ "minCountLabel",
+ "minn",
+ "maxn",
+ "neg",
+ "wordNgrams",
+ "loss",
+ "bucket",
+ "thread",
+ "lrUpdateRate",
+ "t",
+ "label",
+ "verbose",
+ "pretrainedVectors",
+ ]
for arg_name in arg_names:
setattr(self, arg_name, getattr(args, arg_name))
@@ -131,21 +142,18 @@
whitespace (space, newline, tab, vertical tab) and the control
characters carriage return, formfeed and the null character.
"""
- if text.find('\n') != -1:
- raise ValueError(
- "predict processes one line at a time (remove \'\\n\')"
- )
+ if text.find("\n") != -1:
+ raise ValueError("predict processes one line at a time (remove '\\n')")
text += "\n"
dim = self.get_dimension()
b = fasttext.Vector(dim)
self.f.getSentenceVector(b, text)
return np.array(b)
- def get_nearest_neighbors(self, word, k=10, on_unicode_error='strict'):
+ def get_nearest_neighbors(self, word, k=10, on_unicode_error="strict"):
return self.f.getNN(word, k, on_unicode_error)
- def get_analogies(self, wordA, wordB, wordC, k=10,
- on_unicode_error='strict'):
+ def get_analogies(self, wordA, wordB, wordC, k=10, on_unicode_error="strict"):
return self.f.getAnalogies(wordA, wordB, wordC, k, on_unicode_error)
def get_word_id(self, word):
@@ -168,7 +176,7 @@
"""
return self.f.getSubwordId(subword)
- def get_subwords(self, word, on_unicode_error='strict'):
+ def get_subwords(self, word, on_unicode_error="strict"):
"""
Given a word, get the subwords and their indicies.
"""
@@ -184,7 +192,7 @@
self.f.getInputVector(b, ind)
return np.array(b)
- def predict(self, text, k=1, threshold=0.0, on_unicode_error='strict'):
+ def predict(self, text, k=1, threshold=0.0, on_unicode_error="strict"):
"""
Given a string, get a list of labels and a list of
corresponding probabilities. k controls the number
@@ -208,17 +216,16 @@
"""
def check(entry):
- if entry.find('\n') != -1:
- raise ValueError(
- "predict processes one line at a time (remove \'\\n\')"
- )
+ if entry.find("\n") != -1:
+ raise ValueError("predict processes one line at a time (remove '\\n')")
entry += "\n"
return entry
if type(text) == list:
text = [check(entry) for entry in text]
all_labels, all_probs = self.f.multilinePredict(
- text, k, threshold, on_unicode_error)
+ text, k, threshold, on_unicode_error
+ )
return all_labels, all_probs
else:
@@ -249,7 +256,7 @@
raise ValueError("Can't get quantized Matrix")
return np.array(self.f.getOutputMatrix())
- def get_words(self, include_freq=False, on_unicode_error='strict'):
+ def get_words(self, include_freq=False, on_unicode_error="strict"):
"""
Get the entire list of words of the dictionary optionally
including the frequency of the individual words. This
@@ -262,7 +269,7 @@
else:
return pair[0]
- def get_labels(self, include_freq=False, on_unicode_error='strict'):
+ def get_labels(self, include_freq=False, on_unicode_error="strict"):
"""
Get the entire list of labels of the dictionary optionally
including the frequency of the individual labels. Unsupervised
@@ -280,17 +287,15 @@
else:
return self.get_words(include_freq)
- def get_line(self, text, on_unicode_error='strict'):
+ def get_line(self, text, on_unicode_error="strict"):
"""
Split a line of text into words and labels. Labels must start with
the prefix used to create the model (__label__ by default).
"""
def check(entry):
- if entry.find('\n') != -1:
- raise ValueError(
- "get_line processes one line at a time (remove \'\\n\')"
- )
+ if entry.find("\n") != -1:
+ raise ValueError("get_line processes one line at a time (remove '\\n')")
entry += "\n"
return entry
@@ -336,7 +341,7 @@
thread=None,
verbose=None,
dsub=2,
- qnorm=False
+ qnorm=False,
):
"""
Quantize the model reducing the size of the model and
@@ -356,8 +361,7 @@
if input is None:
input = ""
self.f.quantize(
- input, qout, cutoff, retrain, epoch, lr, thread, verbose, dsub,
- qnorm
+ input, qout, cutoff, retrain, epoch, lr, thread, verbose, dsub, qnorm
)
def set_matrices(self, input_matrix, output_matrix):
@@ -365,8 +369,9 @@
Set input and output matrices. This function assumes you know what you
are doing.
"""
- self.f.setMatrices(input_matrix.astype(np.float32),
- output_matrix.astype(np.float32))
+ self.f.setMatrices(
+ input_matrix.astype(np.float32), output_matrix.astype(np.float32)
+ )
@property
def words(self):
@@ -437,46 +442,45 @@
def load_model(path):
"""Load a model given a filepath and return a model object."""
- eprint("Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.")
return _FastText(model_path=path)
unsupervised_default = {
- 'model': "skipgram",
- 'lr': 0.05,
- 'dim': 100,
- 'ws': 5,
- 'epoch': 5,
- 'minCount': 5,
- 'minCountLabel': 0,
- 'minn': 3,
- 'maxn': 6,
- 'neg': 5,
- 'wordNgrams': 1,
- 'loss': "ns",
- 'bucket': 2000000,
- 'thread': multiprocessing.cpu_count() - 1,
- 'lrUpdateRate': 100,
- 't': 1e-4,
- 'label': "__label__",
- 'verbose': 2,
- 'pretrainedVectors': "",
- 'seed': 0,
- 'autotuneValidationFile': "",
- 'autotuneMetric': "f1",
- 'autotunePredictions': 1,
- 'autotuneDuration': 60 * 5, # 5 minutes
- 'autotuneModelSize': ""
+ "model": "skipgram",
+ "lr": 0.05,
+ "dim": 100,
+ "ws": 5,
+ "epoch": 5,
+ "minCount": 5,
+ "minCountLabel": 0,
+ "minn": 3,
+ "maxn": 6,
+ "neg": 5,
+ "wordNgrams": 1,
+ "loss": "ns",
+ "bucket": 2000000,
+ "thread": multiprocessing.cpu_count() - 1,
+ "lrUpdateRate": 100,
+ "t": 1e-4,
+ "label": "__label__",
+ "verbose": 2,
+ "pretrainedVectors": "",
+ "seed": 0,
+ "autotuneValidationFile": "",
+ "autotuneMetric": "f1",
+ "autotunePredictions": 1,
+ "autotuneDuration": 60 * 5, # 5 minutes
+ "autotuneModelSize": "",
}
def read_args(arg_list, arg_dict, arg_names, default_values):
param_map = {
- 'min_count': 'minCount',
- 'word_ngrams': 'wordNgrams',
- 'lr_update_rate': 'lrUpdateRate',
- 'label_prefix': 'label',
- 'pretrained_vectors': 'pretrainedVectors'
+ "min_count": "minCount",
+ "word_ngrams": "wordNgrams",
+ "lr_update_rate": "lrUpdateRate",
+ "label_prefix": "label",
+ "pretrained_vectors": "pretrainedVectors",
}
ret = {}
@@ -512,22 +516,45 @@
repository such as the dataset pulled by classification-example.sh.
"""
supervised_default = unsupervised_default.copy()
- supervised_default.update({
- 'lr': 0.1,
- 'minCount': 1,
- 'minn': 0,
- 'maxn': 0,
- 'loss': "softmax",
- 'model': "supervised"
- })
-
- arg_names = ['input', 'lr', 'dim', 'ws', 'epoch', 'minCount',
- 'minCountLabel', 'minn', 'maxn', 'neg', 'wordNgrams', 'loss', 'bucket',
- 'thread', 'lrUpdateRate', 't', 'label', 'verbose', 'pretrainedVectors',
- 'seed', 'autotuneValidationFile', 'autotuneMetric',
- 'autotunePredictions', 'autotuneDuration', 'autotuneModelSize']
- args, manually_set_args = read_args(kargs, kwargs, arg_names,
- supervised_default)
+ supervised_default.update(
+ {
+ "lr": 0.1,
+ "minCount": 1,
+ "minn": 0,
+ "maxn": 0,
+ "loss": "softmax",
+ "model": "supervised",
+ }
+ )
+
+ arg_names = [
+ "input",
+ "lr",
+ "dim",
+ "ws",
+ "epoch",
+ "minCount",
+ "minCountLabel",
+ "minn",
+ "maxn",
+ "neg",
+ "wordNgrams",
+ "loss",
+ "bucket",
+ "thread",
+ "lrUpdateRate",
+ "t",
+ "label",
+ "verbose",
+ "pretrainedVectors",
+ "seed",
+ "autotuneValidationFile",
+ "autotuneMetric",
+ "autotunePredictions",
+ "autotuneDuration",
+ "autotuneModelSize",
+ ]
+ args, manually_set_args = read_args(kargs, kwargs, arg_names, supervised_default)
a = _build_args(args, manually_set_args)
ft = _FastText(args=a)
fasttext.train(ft.f, a)
@@ -549,11 +576,29 @@
dataset pulled by the example script word-vector-example.sh, which is
part of the fastText repository.
"""
- arg_names = ['input', 'model', 'lr', 'dim', 'ws', 'epoch', 'minCount',
- 'minCountLabel', 'minn', 'maxn', 'neg', 'wordNgrams', 'loss', 'bucket',
- 'thread', 'lrUpdateRate', 't', 'label', 'verbose', 'pretrainedVectors']
- args, manually_set_args = read_args(kargs, kwargs, arg_names,
- unsupervised_default)
+ arg_names = [
+ "input",
+ "model",
+ "lr",
+ "dim",
+ "ws",
+ "epoch",
+ "minCount",
+ "minCountLabel",
+ "minn",
+ "maxn",
+ "neg",
+ "wordNgrams",
+ "loss",
+ "bucket",
+ "thread",
+ "lrUpdateRate",
+ "t",
+ "label",
+ "verbose",
+ "pretrainedVectors",
+ ]
+ args, manually_set_args = read_args(kargs, kwargs, arg_names, unsupervised_default)
a = _build_args(args, manually_set_args)
ft = _FastText(args=a)
fasttext.train(ft.f, a)
@@ -562,12 +607,18 @@
def cbow(*kargs, **kwargs):
- raise Exception("`cbow` is not supported any more. Please use `train_unsupervised` with model=`cbow`. For more information please refer to https://fasttext.cc/blog/2019/06/25/blog-post.html#2-you-were-using-the-unofficial-fasttext-module")
+ raise Exception(
+ "`cbow` is not supported any more. Please use `train_unsupervised` with model=`cbow`. For more information please refer to https://fasttext.cc/blog/2019/06/25/blog-post.html#2-you-were-using-the-unofficial-fasttext-module"
+ )
def skipgram(*kargs, **kwargs):
- raise Exception("`skipgram` is not supported any more. Please use `train_unsupervised` with model=`skipgram`. For more information please refer to https://fasttext.cc/blog/2019/06/25/blog-post.html#2-you-were-using-the-unofficial-fasttext-module")
+ raise Exception(
+ "`skipgram` is not supported any more. Please use `train_unsupervised` with model=`skipgram`. For more information please refer to https://fasttext.cc/blog/2019/06/25/blog-post.html#2-you-were-using-the-unofficial-fasttext-module"
+ )
def supervised(*kargs, **kwargs):
- raise Exception("`supervised` is not supported any more. Please use `train_supervised`. For more information please refer to https://fasttext.cc/blog/2019/06/25/blog-post.html#2-you-were-using-the-unofficial-fasttext-module")
+ raise Exception(
+ "`supervised` is not supported any more. Please use `train_supervised`. For more information please refer to https://fasttext.cc/blog/2019/06/25/blog-post.html#2-you-were-using-the-unofficial-fasttext-module"
+ )
diff -u -r fasttext-wheel-0.9.2/python/fasttext_module/fasttext/pybind/fasttext_pybind.cc fasttext-0.9.3/python/fasttext_module/fasttext/pybind/fasttext_pybind.cc
--- fasttext-wheel-0.9.2/python/fasttext_module/fasttext/pybind/fasttext_pybind.cc 2023-05-05 07:58:14.000000000 -0400
+++ fasttext-0.9.3/python/fasttext_module/fasttext/pybind/fasttext_pybind.cc 2024-06-12 04:55:55.000000000 -0400
@@ -78,8 +78,9 @@
} else if (type == fasttext::entry_type::label && wid >= 0) {
labels.push_back(castToPythonString(token, onUnicodeError));
}
- if (token == fasttext::Dictionary::EOS)
+ if (token == fasttext::Dictionary::EOS) {
break;
+}
}
return std::pair<std::vector<py::str>, std::vector<py::str>>(words, labels);
}
@@ -199,28 +200,29 @@
.def("scoreVsTrue", &fasttext::Meter::scoreVsTrue)
.def(
"precisionRecallCurveLabel",
- py::overload_cast<int32_t>(
- &fasttext::Meter::precisionRecallCurve, py::const_))
+ (std::vector<std::pair<double, double>>(fasttext::Meter::*)(int32_t)
+ const) &
+ fasttext::Meter::precisionRecallCurve)
.def(
"precisionRecallCurve",
- py::overload_cast<>(
- &fasttext::Meter::precisionRecallCurve, py::const_))
+ (std::vector<std::pair<double, double>>(fasttext::Meter::*)() const) &
+ fasttext::Meter::precisionRecallCurve)
.def(
"precisionAtRecallLabel",
- py::overload_cast<int32_t, double>(
- &fasttext::Meter::precisionAtRecall, py::const_))
+ (double (fasttext::Meter::*)(int32_t, double) const) &
+ fasttext::Meter::precisionAtRecall)
.def(
"precisionAtRecall",
- py::overload_cast<double>(
- &fasttext::Meter::precisionAtRecall, py::const_))
+ (double (fasttext::Meter::*)(double) const) &
+ fasttext::Meter::precisionAtRecall)
.def(
"recallAtPrecisionLabel",
- py::overload_cast<int32_t, double>(
- &fasttext::Meter::recallAtPrecision, py::const_))
+ (double (fasttext::Meter::*)(int32_t, double) const) &
+ fasttext::Meter::recallAtPrecision)
.def(
"recallAtPrecision",
- py::overload_cast<double>(
- &fasttext::Meter::recallAtPrecision, py::const_));
+ (double (fasttext::Meter::*)(double) const) &
+ fasttext::Meter::recallAtPrecision);
py::class_<fasttext::FastText>(m, "fasttext")
.def(py::init<>())
Only in fasttext-0.9.3/python/fasttext_module: fasttext.egg-info
Only in fasttext-wheel-0.9.2/python/fasttext_module: fasttext_wheel.egg-info
diff -u -r fasttext-wheel-0.9.2/setup.py fasttext-0.9.3/setup.py
--- fasttext-wheel-0.9.2/setup.py 2023-05-05 07:58:44.000000000 -0400
+++ fasttext-0.9.3/setup.py 2024-06-12 05:27:28.000000000 -0400
@@ -21,33 +21,36 @@
import platform
import io
-__version__ = '0.9.2'
+__version__ = "0.9.3"
FASTTEXT_SRC = "src"
# Based on https://github.com/pybind/python_example
-class get_pybind_include(object):
+
+class get_pybind_include:
"""Helper class to determine the pybind11 include path
The purpose of this class is to postpone importing pybind11
until it is actually installed, so that the ``get_include()``
- method can be invoked. """
+ method can be invoked."""
def __init__(self, user=False):
try:
- import pybind11
+ pass
except ImportError:
- if subprocess.call([sys.executable, '-m', 'pip', 'install', 'pybind11']):
- raise RuntimeError('pybind11 install failed.')
+ if subprocess.call([sys.executable, "-m", "pip", "install", "pybind11"]):
+ raise RuntimeError("pybind11 install failed.")
self.user = user
def __str__(self):
import pybind11
+
return pybind11.get_include(self.user)
+
try:
- coverage_index = sys.argv.index('--coverage')
+ coverage_index = sys.argv.index("--coverage")
except ValueError:
coverage = False
else:
@@ -55,7 +58,7 @@
coverage = True
fasttext_src_files = map(str, os.listdir(FASTTEXT_SRC))
-fasttext_src_cc = list(filter(lambda x: x.endswith('.cc'), fasttext_src_files))
+fasttext_src_cc = list(filter(lambda x: x.endswith(".cc"), fasttext_src_files))
fasttext_src_cc = list(
map(lambda x: str(os.path.join(FASTTEXT_SRC, x)), fasttext_src_cc)
@@ -63,10 +66,11 @@
ext_modules = [
Extension(
- str('fasttext_pybind'),
+ str("fasttext_pybind"),
[
- str('python/fasttext_module/fasttext/pybind/fasttext_pybind.cc'),
- ] + fasttext_src_cc,
+ str("python/fasttext_module/fasttext/pybind/fasttext_pybind.cc"),
+ ]
+ + fasttext_src_cc,
include_dirs=[
# Path to pybind11 headers
get_pybind_include(),
@@ -74,9 +78,12 @@
# Path to fasttext source code
FASTTEXT_SRC,
],
- language='c++',
- extra_compile_args=["-O0 -fno-inline -fprofile-arcs -pthread -march=native" if coverage else
- "-O3 -funroll-loops -pthread -march=native"],
+ language="c++",
+ extra_compile_args=[
+ "-O0 -fno-inline -fprofile-arcs -pthread -march=native"
+ if coverage
+ else "-O3 -funroll-loops -pthread -march=native"
+ ],
),
]
@@ -88,8 +95,9 @@
the specified compiler.
"""
import tempfile
- with tempfile.NamedTemporaryFile('w', suffix='.cpp') as f:
- f.write('int main (int argc, char **argv) { return 0; }')
+
+ with tempfile.NamedTemporaryFile("w", suffix=".cpp") as f:
+ f.write("int main (int argc, char **argv) { return 0; }")
try:
compiler.compile([f.name], extra_postargs=flags)
except setuptools.distutils.errors.CompileError:
@@ -98,58 +106,53 @@
def cpp_flag(compiler):
- """Return the -std=c++[11/14] compiler flag.
- The c++14 is preferred over c++11 (when it is available).
- """
- standards = ['-std=c++14', '-std=c++11']
+ """Return the -std=c++17 compiler flag."""
+ standards = ["-std=c++17"]
for standard in standards:
if has_flag(compiler, [standard]):
return standard
- raise RuntimeError(
- 'Unsupported compiler -- at least C++11 support '
- 'is needed!'
- )
+ raise RuntimeError("Unsupported compiler -- at least C++17 support " "is needed!")
class BuildExt(build_ext):
"""A custom build extension for adding compiler-specific options."""
+
c_opts = {
- 'msvc': ['/EHsc'],
- 'unix': [],
+ "msvc": ["/EHsc"],
+ "unix": [],
}
def build_extensions(self):
- if sys.platform == 'darwin':
- mac_osx_version = float('.'.join(platform.mac_ver()[0].split('.')[:2]))
- os.environ['MACOSX_DEPLOYMENT_TARGET'] = str(mac_osx_version)
- all_flags = ['-stdlib=libc++', '-mmacosx-version-min=10.7']
+ if sys.platform == "darwin":
+ mac_osx_version = float(".".join(platform.mac_ver()[0].split(".")[:2]))
+ os.environ["MACOSX_DEPLOYMENT_TARGET"] = str(mac_osx_version)
+ all_flags = ["-stdlib=libc++", "-mmacosx-version-min=10.7"]
if has_flag(self.compiler, [all_flags[0]]):
- self.c_opts['unix'] += [all_flags[0]]
+ self.c_opts["unix"] += [all_flags[0]]
elif has_flag(self.compiler, all_flags):
- self.c_opts['unix'] += all_flags
+ self.c_opts["unix"] += all_flags
else:
raise RuntimeError(
- 'libc++ is needed! Failed to compile with {} and {}.'.
- format(" ".join(all_flags), all_flags[0])
+ "libc++ is needed! Failed to compile with {} and {}.".format(
+ " ".join(all_flags), all_flags[0]
+ )
)
ct = self.compiler.compiler_type
opts = self.c_opts.get(ct, [])
extra_link_args = []
if coverage:
- coverage_option = '--coverage'
+ coverage_option = "--coverage"
opts.append(coverage_option)
extra_link_args.append(coverage_option)
- if ct == 'unix':
+ if ct == "unix":
opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version())
opts.append(cpp_flag(self.compiler))
- if has_flag(self.compiler, ['-fvisibility=hidden']):
- opts.append('-fvisibility=hidden')
- elif ct == 'msvc':
- opts.append(
- '/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version()
- )
+ if has_flag(self.compiler, ["-fvisibility=hidden"]):
+ opts.append("-fvisibility=hidden")
+ elif ct == "msvc":
+ opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version())
for ext in self.extensions:
ext.extra_compile_args = opts
ext.extra_link_args = extra_link_args
@@ -161,43 +164,43 @@
Use pandoc to generate rst from md.
pandoc --from=markdown --to=rst --output=python/README.rst python/README.md
"""
- with io.open("python/README.rst", encoding='utf-8') as fid:
+ with io.open("python/README.rst", encoding="utf-8") as fid:
return fid.read()
setup(
- name='fasttext-wheel',
+ name="fasttext",
version=__version__,
- author='Onur Celebi',
- author_email='celebio@fb.com',
- description='fasttext Python bindings',
+ author="Onur Celebi",
+ author_email="celebio@fb.com",
+ description="fasttext Python bindings",
long_description=_get_readme(),
ext_modules=ext_modules,
- url='https://github.com/facebookresearch/fastText',
- license='MIT',
+ url="https://github.com/facebookresearch/fastText",
+ license="MIT",
classifiers=[
- 'Development Status :: 3 - Alpha',
- 'Intended Audience :: Developers',
- 'Intended Audience :: Science/Research',
- 'License :: OSI Approved :: MIT License',
- 'Programming Language :: Python :: 2.7',
- 'Programming Language :: Python :: 3.4',
- 'Programming Language :: Python :: 3.5',
- 'Programming Language :: Python :: 3.6',
- 'Topic :: Software Development',
- 'Topic :: Scientific/Engineering',
- 'Operating System :: Microsoft :: Windows',
- 'Operating System :: POSIX',
- 'Operating System :: Unix',
- 'Operating System :: MacOS',
+ "Development Status :: 3 - Alpha",
+ "Intended Audience :: Developers",
+ "Intended Audience :: Science/Research",
+ "License :: OSI Approved :: MIT License",
+ "Programming Language :: Python :: 2.7",
+ "Programming Language :: Python :: 3.4",
+ "Programming Language :: Python :: 3.5",
+ "Programming Language :: Python :: 3.6",
+ "Topic :: Software Development",
+ "Topic :: Scientific/Engineering",
+ "Operating System :: Microsoft :: Windows",
+ "Operating System :: POSIX",
+ "Operating System :: Unix",
+ "Operating System :: MacOS",
],
- install_requires=['pybind11>=2.2', "setuptools >= 0.7.0", "numpy"],
- cmdclass={'build_ext': BuildExt},
+ install_requires=["pybind11>=2.2", "setuptools >= 0.7.0", "numpy"],
+ cmdclass={"build_ext": BuildExt},
packages=[
- str('fasttext'),
- str('fasttext.util'),
- str('fasttext.tests'),
+ str("fasttext"),
+ str("fasttext.util"),
+ str("fasttext.tests"),
],
- package_dir={str(''): str('python/fasttext_module')},
+ package_dir={str(""): str("python/fasttext_module")},
zip_safe=False,
)
Only in fasttext-0.9.3/src: aligned.h
diff -u -r fasttext-wheel-0.9.2/src/args.cc fasttext-0.9.3/src/args.cc
--- fasttext-wheel-0.9.2/src/args.cc 2023-05-05 07:58:14.000000000 -0400
+++ fasttext-0.9.3/src/args.cc 2024-06-12 04:55:55.000000000 -0400
@@ -8,7 +8,8 @@
#include "args.h"
-#include <stdlib.h>
+#include <cstdlib>
+#include <cstdint>
#include <iostream>
#include <stdexcept>
@@ -106,7 +107,7 @@
}
void Args::parseArgs(const std::vector<std::string>& args) {
- std::string command(args[1]);
+ const std::string& command(args[1]);
if (command == "supervised") {
model = model_name::sup;
loss = loss_name::softmax;
@@ -401,13 +402,13 @@
} else if (autotuneMetric == "f1") {
return metric_name::f1score;
} else if (autotuneMetric.substr(0, 18) == "precisionAtRecall:") {
- size_t semicolon = autotuneMetric.find(":", 18);
+ size_t semicolon = autotuneMetric.find(':', 18);
if (semicolon != std::string::npos) {
return metric_name::precisionAtRecallLabel;
}
return metric_name::precisionAtRecall;
} else if (autotuneMetric.substr(0, 18) == "recallAtPrecision:") {
- size_t semicolon = autotuneMetric.find(":", 18);
+ size_t semicolon = autotuneMetric.find(':', 18);
if (semicolon != std::string::npos) {
return metric_name::recallAtPrecisionLabel;
}
@@ -424,7 +425,7 @@
} else if (
metric == metric_name::precisionAtRecallLabel ||
metric == metric_name::recallAtPrecisionLabel) {
- size_t semicolon = autotuneMetric.find(":", 18);
+ size_t semicolon = autotuneMetric.find(':', 18);
label = autotuneMetric.substr(semicolon + 1);
} else {
return label;
@@ -444,7 +445,7 @@
metric == metric_name::recallAtPrecisionLabel ||
metric == metric_name::recallAtPrecision) {
size_t firstSemicolon = 18; // semicolon position in "precisionAtRecall:"
- size_t secondSemicolon = autotuneMetric.find(":", firstSemicolon);
+ size_t secondSemicolon = autotuneMetric.find(':', firstSemicolon);
const std::string valueStr =
autotuneMetric.substr(firstSemicolon, secondSemicolon - firstSemicolon);
value = std::stof(valueStr) / 100.0;
diff -u -r fasttext-wheel-0.9.2/src/densematrix.cc fasttext-0.9.3/src/densematrix.cc
--- fasttext-wheel-0.9.2/src/densematrix.cc 2023-05-05 07:58:14.000000000 -0400
+++ fasttext-0.9.3/src/densematrix.cc 2024-06-12 04:55:55.000000000 -0400
@@ -15,6 +15,10 @@
#include "utils.h"
#include "vector.h"
+#if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE__)
+#include <immintrin.h>
+#endif
+
namespace fasttext {
DenseMatrix::DenseMatrix() : DenseMatrix(0, 0) {}
@@ -146,6 +150,92 @@
}
}
+/* Abstract over AVX512F, AVX, and SSE intrinsics, using the one available on this machine. */
+#if defined(__AVX512F__)
+using Register = __m512;
+inline Register Add(Register first, Register second) { return _mm512_add_ps(first, second); }
+inline Register Set1(float to) { return _mm512_set1_ps(to); }
+inline Register Multiply(Register first, Register second) { return _mm512_mul_ps(first, second); }
+#elif defined(__AVX__)
+using Register = __m256;
+inline Register Add(Register first, Register second) { return _mm256_add_ps(first, second); }
+inline Register Set1(float to) { return _mm256_set1_ps(to); }
+inline Register Multiply(Register first, Register second) { return _mm256_mul_ps(first, second); }
+#elif defined(__SSE__)
+using Register = __m128;
+inline Register Add(Register first, Register second) { return _mm_add_ps(first, second); }
+inline Register Set1(float to) { return _mm_set1_ps(to); }
+inline Register Multiply(Register first, Register second) { return _mm_mul_ps(first, second); }
+#endif
+
+/* Faster routine for averaging rows of a matrix on x86.
+ * The idea here is to keep the accumulators in registers if possible. */
+#if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE__)
+template <unsigned Cols> void averageRowsFast(Vector& x, const std::vector<int32_t>& rows, const DenseMatrix &matrix) {
+ // Columns must be a multiple of how many floats fit in a register.
+ static_assert(Cols % (sizeof(Register) / 4) == 0);
+ constexpr unsigned RegisterCount = Cols / (sizeof(Register) / 4);
+ // These should be aligned by aligned.h
+ assert(reinterpret_cast<uintptr_t>(x.data()) % sizeof(Register) == 0);
+ assert(reinterpret_cast<uintptr_t>(matrix.data()) % sizeof(Register) == 0);
+
+ // Guard against empty list of rows with default NaN behavior.
+ if (rows.empty()) {
+ x.zero();
+ x.mul(1.0 / rows.size());
+ return;
+ }
+
+ // Copy the first row to accumulation registers.
+ Register accum[RegisterCount];
+ auto row = rows.cbegin();
+ const Register *base = reinterpret_cast<const Register*>(matrix.data() + matrix.cols() * *row);
+ for (unsigned i = 0; i < RegisterCount; ++i) {
+ accum[i] = base[i];
+ }
+ // Add the rows after the first.
+ for (++row; row != rows.cend(); ++row) {
+ base = reinterpret_cast<const Register*>(matrix.data() + matrix.cols() * *row);
+ for (unsigned i = 0; i < RegisterCount; ++i) {
+ accum[i] = Add(accum[i], base[i]);
+ }
+ }
+ // Multiply by (1.0 / rows.size()) and write to x.
+ Register mul = Set1(1.0 / rows.size());
+ for (unsigned i = 0; i < RegisterCount; ++i) {
+ reinterpret_cast<Register*>(x.data())[i] = Multiply(accum[i], mul);
+ }
+}
+#endif
+
+void DenseMatrix::averageRowsToVector(Vector& x, const std::vector<int32_t>& rows) const {
+#if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE__)
+ switch (cols()) {
+ case 512:
+ // Maximum number that can fit all in registers on AVX512F.
+ averageRowsFast<512>(x, rows, *this);
+ return;
+ case 256:
+ averageRowsFast<256>(x, rows, *this);
+ return;
+ case 64:
+ averageRowsFast<64>(x, rows, *this);
+ return;
+ case 32:
+ averageRowsFast<32>(x, rows, *this);
+ return;
+ case 16:
+ averageRowsFast<16>(x, rows, *this);
+ return;
+ }
+#endif
+ x.zero();
+ for (auto it = rows.cbegin(); it != rows.cend(); ++it) {
+ addRowToVector(x, *it);
+ }
+ x.mul(1.0 / rows.size());
+}
+
void DenseMatrix::save(std::ostream& out) const {
out.write((char*)&m_, sizeof(int64_t));
out.write((char*)&n_, sizeof(int64_t));
@@ -155,7 +245,7 @@
void DenseMatrix::load(std::istream& in) {
in.read((char*)&m_, sizeof(int64_t));
in.read((char*)&n_, sizeof(int64_t));
- data_ = std::vector<real>(m_ * n_);
+ data_ = intgemm::AlignedVector<real>(m_ * n_);
in.read((char*)data_.data(), m_ * n_ * sizeof(real));
}
diff -u -r fasttext-wheel-0.9.2/src/densematrix.h fasttext-0.9.3/src/densematrix.h
--- fasttext-wheel-0.9.2/src/densematrix.h 2023-05-05 07:58:14.000000000 -0400
+++ fasttext-0.9.3/src/densematrix.h 2024-06-12 04:55:55.000000000 -0400
@@ -15,6 +15,7 @@
#include <stdexcept>
#include <vector>
+#include "aligned.h"
#include "matrix.h"
#include "real.h"
@@ -24,7 +25,7 @@
class DenseMatrix : public Matrix {
protected:
- std::vector<real> data_;
+ intgemm::AlignedVector<real> data_;
void uniformThread(real, int, int32_t);
public:
@@ -71,6 +72,7 @@
void addVectorToRow(const Vector&, int64_t, real) override;
void addRowToVector(Vector& x, int32_t i) const override;
void addRowToVector(Vector& x, int32_t i, real a) const override;
+ void averageRowsToVector(Vector& x, const std::vector<int32_t>& rows) const override;
void save(std::ostream&) const override;
void load(std::istream&) override;
void dump(std::ostream&) const override;
diff -u -r fasttext-wheel-0.9.2/src/dictionary.cc fasttext-0.9.3/src/dictionary.cc
--- fasttext-wheel-0.9.2/src/dictionary.cc 2023-05-05 07:58:14.000000000 -0400
+++ fasttext-0.9.3/src/dictionary.cc 2024-06-12 04:55:55.000000000 -0400
@@ -42,11 +42,11 @@
load(in);
}
-int32_t Dictionary::find(const std::string& w) const {
+int32_t Dictionary::find(const std::string_view w) const {
return find(w, hash(w));
}
-int32_t Dictionary::find(const std::string& w, uint32_t h) const {
+int32_t Dictionary::find(const std::string_view w, uint32_t h) const {
int32_t word2intsize = word2int_.size();
int32_t id = h % word2intsize;
while (word2int_[id] != -1 && words_[word2int_[id]].word != w) {
@@ -126,12 +126,12 @@
return rand > pdiscard_[id];
}
-int32_t Dictionary::getId(const std::string& w, uint32_t h) const {
+int32_t Dictionary::getId(const std::string_view w, uint32_t h) const {
int32_t id = find(w, h);
return word2int_[id];
}
-int32_t Dictionary::getId(const std::string& w) const {
+int32_t Dictionary::getId(const std::string_view w) const {
int32_t h = find(w);
return word2int_[h];
}
@@ -142,7 +142,7 @@
return words_[id].type;
}
-entry_type Dictionary::getType(const std::string& w) const {
+entry_type Dictionary::getType(const std::string_view w) const {
return (w.find(args_->label) == 0) ? entry_type::label : entry_type::word;
}
@@ -160,7 +160,7 @@
// Since all fasttext models that were already released were trained
// using signed char, we fixed the hash function to make models
// compatible whatever compiler is used.
-uint32_t Dictionary::hash(const std::string& str) const {
+uint32_t Dictionary::hash(const std::string_view str) const {
uint32_t h = 2166136261;
for (size_t i = 0; i < str.size(); i++) {
h = h ^ uint32_t(int8_t(str[i]));
@@ -324,11 +324,16 @@
void Dictionary::addSubwords(
std::vector<int32_t>& line,
- const std::string& token,
+ const std::string_view token,
int32_t wid) const {
if (wid < 0) { // out of vocab
if (token != EOS) {
- computeSubwords(BOW + token + EOW, line);
+ std::string concat;
+ concat.reserve(BOW.size() + token.size() + EOW.size());
+ concat += BOW;
+ concat.append(token.data(), token.size());
+ concat += EOW;
+ computeSubwords(concat, line);
}
} else {
if (args_->maxn <= 0) { // in vocab w/o subwords
@@ -390,6 +395,51 @@
uint32_t h = hash(token);
int32_t wid = getId(token, h);
entry_type type = wid < 0 ? getType(token) : getType(wid);
+
+ ntokens++;
+ if (type == entry_type::word) {
+ addSubwords(words, token, wid);
+ word_hashes.push_back(h);
+ } else if (type == entry_type::label && wid >= 0) {
+ labels.push_back(wid - nwords_);
+ }
+ if (token == EOS) {
+ break;
+ }
+ }
+ addWordNgrams(words, word_hashes, args_->wordNgrams);
+ return ntokens;
+}
+
+namespace {
+bool readWordNoNewline(std::string_view& in, std::string_view& word) {
+ const std::string_view spaces(" \n\r\t\v\f\0");
+ std::string_view::size_type begin = in.find_first_not_of(spaces);
+ if (begin == std::string_view::npos) {
+ in.remove_prefix(in.size());
+ return false;
+ }
+ in.remove_prefix(begin);
+ word = in.substr(0, in.find_first_of(spaces));
+ in.remove_prefix(word.size());
+ return true;
+}
+} // namespace
+
+int32_t Dictionary::getStringNoNewline(
+ std::string_view in,
+ std::vector<int32_t>& words,
+ std::vector<int32_t>& labels) const {
+ std::vector<int32_t> word_hashes;
+ std::string_view token;
+ int32_t ntokens = 0;
+
+ words.clear();
+ labels.clear();
+ while (readWordNoNewline(in, token)) {
+ uint32_t h = hash(token);
+ int32_t wid = getId(token, h);
+ entry_type type = wid < 0 ? getType(token) : getType(wid);
ntokens++;
if (type == entry_type::word) {
diff -u -r fasttext-wheel-0.9.2/src/dictionary.h fasttext-0.9.3/src/dictionary.h
--- fasttext-wheel-0.9.2/src/dictionary.h 2023-05-05 07:58:14.000000000 -0400
+++ fasttext-0.9.3/src/dictionary.h 2024-06-12 04:55:55.000000000 -0400
@@ -13,6 +13,7 @@
#include <ostream>
#include <random>
#include <string>
+#include <string_view>
#include <unordered_map>
#include <vector>
@@ -36,13 +37,13 @@
static const int32_t MAX_VOCAB_SIZE = 30000000;
static const int32_t MAX_LINE_SIZE = 1024;
- int32_t find(const std::string&) const;
- int32_t find(const std::string&, uint32_t h) const;
+ int32_t find(const std::string_view) const;
+ int32_t find(const std::string_view, uint32_t h) const;
void initTableDiscard();
void initNgrams();
void reset(std::istream&) const;
void pushHash(std::vector<int32_t>&, int32_t) const;
- void addSubwords(std::vector<int32_t>&, const std::string&, int32_t) const;
+ void addSubwords(std::vector<int32_t>&, const std::string_view, int32_t) const;
std::shared_ptr<Args> args_;
std::vector<int32_t> word2int_;
@@ -71,10 +72,10 @@
int32_t nwords() const;
int32_t nlabels() const;
int64_t ntokens() const;
- int32_t getId(const std::string&) const;
- int32_t getId(const std::string&, uint32_t h) const;
+ int32_t getId(const std::string_view) const;
+ int32_t getId(const std::string_view, uint32_t h) const;
entry_type getType(int32_t) const;
- entry_type getType(const std::string&) const;
+ entry_type getType(const std::string_view) const;
bool discard(int32_t, real) const;
std::string getWord(int32_t) const;
const std::vector<int32_t>& getSubwords(int32_t) const;
@@ -87,7 +88,7 @@
const std::string&,
std::vector<int32_t>&,
std::vector<std::string>* substrings = nullptr) const;
- uint32_t hash(const std::string& str) const;
+ uint32_t hash(const std::string_view str) const;
void add(const std::string&);
bool readWord(std::istream&, std::string&) const;
void readFromFile(std::istream&);
@@ -99,6 +100,8 @@
const;
int32_t getLine(std::istream&, std::vector<int32_t>&, std::minstd_rand&)
const;
+ int32_t getStringNoNewline(std::string_view, std::vector<int32_t>&,
+ std::vector<int32_t>&) const;
void threshold(int64_t, int64_t);
void prune(std::vector<int32_t>&);
bool isPruned() {
diff -u -r fasttext-wheel-0.9.2/src/fasttext.cc fasttext-0.9.3/src/fasttext.cc
--- fasttext-wheel-0.9.2/src/fasttext.cc 2023-05-05 07:58:14.000000000 -0400
+++ fasttext-0.9.3/src/fasttext.cc 2024-06-12 04:55:55.000000000 -0400
@@ -532,7 +532,7 @@
if (ngrams[i] >= 0) {
vec.addRow(*input_, ngrams[i]);
}
- result.push_back(std::make_pair(substrings[i], std::move(vec)));
+ result.emplace_back(substrings[i], std::move(vec));
}
return result;
}
@@ -609,7 +609,7 @@
const std::string& wordA,
const std::string& wordB,
const std::string& wordC) {
- Vector query = Vector(args_->dim);
+ Vector query(args_->dim);
query.zero();
Vector buffer(args_->dim);
diff -u -r fasttext-wheel-0.9.2/src/matrix.h fasttext-0.9.3/src/matrix.h
--- fasttext-wheel-0.9.2/src/matrix.h 2023-05-05 07:58:14.000000000 -0400
+++ fasttext-0.9.3/src/matrix.h 2024-06-12 04:55:55.000000000 -0400
@@ -36,6 +36,7 @@
virtual void addVectorToRow(const Vector&, int64_t, real) = 0;
virtual void addRowToVector(Vector& x, int32_t i) const = 0;
virtual void addRowToVector(Vector& x, int32_t i, real a) const = 0;
+ virtual void averageRowsToVector(Vector& x, const std::vector<int32_t>& rows) const = 0;
virtual void save(std::ostream&) const = 0;
virtual void load(std::istream&) = 0;
virtual void dump(std::ostream&) const = 0;
diff -u -r fasttext-wheel-0.9.2/src/meter.cc fasttext-0.9.3/src/meter.cc
--- fasttext-wheel-0.9.2/src/meter.cc 2023-05-05 07:58:14.000000000 -0400
+++ fasttext-0.9.3/src/meter.cc 2024-06-12 04:55:55.000000000 -0400
@@ -39,9 +39,9 @@
labelMetrics_[prediction.second].scoreVsTrue.emplace_back(score, gold);
}
- if (falseNegativeLabels_) {
- for (const auto& label : labels) {
- labelMetrics_[label].gold++;
+ for (const auto& label : labels) {
+ labelMetrics_[label].gold++;
+ if (falseNegativeLabels_) {
if (!utils::containsSecond(predictions, label)) {
labelMetrics_[label].scoreVsTrue.emplace_back(falseNegativeScore, 1.0);
}
diff -u -r fasttext-wheel-0.9.2/src/model.cc fasttext-0.9.3/src/model.cc
--- fasttext-wheel-0.9.2/src/model.cc 2023-05-05 07:58:14.000000000 -0400
+++ fasttext-0.9.3/src/model.cc 2024-06-12 04:55:55.000000000 -0400
@@ -42,11 +42,7 @@
void Model::computeHidden(const std::vector<int32_t>& input, State& state)
const {
Vector& hidden = state.hidden;
- hidden.zero();
- for (auto it = input.cbegin(); it != input.cend(); ++it) {
- hidden.addRow(*wi_, *it);
- }
- hidden.mul(1.0 / input.size());
+ wi_->averageRowsToVector(hidden, input);
}
void Model::predict(
diff -u -r fasttext-wheel-0.9.2/src/quantmatrix.cc fasttext-0.9.3/src/quantmatrix.cc
--- fasttext-wheel-0.9.2/src/quantmatrix.cc 2023-05-05 07:58:14.000000000 -0400
+++ fasttext-0.9.3/src/quantmatrix.cc 2024-06-12 04:55:55.000000000 -0400
@@ -80,6 +80,14 @@
pq_->addcode(x, codes_.data(), i, norm);
}
+void QuantMatrix::averageRowsToVector(Vector& x, const std::vector<int32_t>& rows) const {
+ x.zero();
+ for (auto it = rows.cbegin(); it != rows.cend(); ++it) {
+ addRowToVector(x, *it);
+ }
+ x.mul(1.0 / rows.size());
+}
+
void QuantMatrix::save(std::ostream& out) const {
out.write((char*)&qnorm_, sizeof(qnorm_));
out.write((char*)&m_, sizeof(m_));
diff -u -r fasttext-wheel-0.9.2/src/quantmatrix.h fasttext-0.9.3/src/quantmatrix.h
--- fasttext-wheel-0.9.2/src/quantmatrix.h 2023-05-05 07:58:14.000000000 -0400
+++ fasttext-0.9.3/src/quantmatrix.h 2024-06-12 04:55:55.000000000 -0400
@@ -52,6 +52,7 @@
void addVectorToRow(const Vector&, int64_t, real) override;
void addRowToVector(Vector& x, int32_t i) const override;
void addRowToVector(Vector& x, int32_t i, real a) const override;
+ void averageRowsToVector(Vector& x, const std::vector<int32_t>& rows) const override;
void save(std::ostream&) const override;
void load(std::istream&) override;
void dump(std::ostream&) const override;
diff -u -r fasttext-wheel-0.9.2/src/vector.h fasttext-0.9.3/src/vector.h
--- fasttext-wheel-0.9.2/src/vector.h 2023-05-05 07:58:14.000000000 -0400
+++ fasttext-0.9.3/src/vector.h 2024-06-12 04:55:55.000000000 -0400
@@ -12,6 +12,7 @@
#include <ostream>
#include <vector>
+#include "aligned.h"
#include "real.h"
namespace fasttext {
@@ -20,12 +21,12 @@
class Vector {
protected:
- std::vector<real> data_;
+ intgemm::AlignedVector<real> data_;
public:
explicit Vector(int64_t);
Vector(const Vector&) = default;
- Vector(Vector&&) noexcept = default;
+ Vector(Vector&&) = default;
Vector& operator=(const Vector&) = default;
Vector& operator=(Vector&&) = default;
|
|
Hi, thanks for your interest in our project. Regarding this issue: as you may have noticed, installing For this PR, however, For example: # requirements.txt
...
fasttext; python_version >= "3.13"
fasttext-wheel; python_version < "3.13"
...This keeps the current default behavior for existing Python versions while enabling Python 3.13 support. |
5508c6c to
8305e8d
Compare
8305e8d to
94e6971
Compare
Ok thanks - I have re-pushed the change with the recommended fixes, they look like a great solution to me |
|
Hi, it should trigger a GitHub action CI here for automatic checking, but now I didn't see any signal about it. I am not sure if this is because of a |
|
OK i fixed the auto CI/CD issue by adding a commit, let's wait and see. If everything works well, I will merge this PR. Thanks! |
|
Test passed, looks good to me, thanks! |
The fastText project has been archived, and it looks like the latest version is now only available in the
`fasttext` package on PyPI, not `fasttext-wheel`. This change switches to using `fasttext` (0.9.3).

The reason for this is that the wheel published as
`fasttext-wheel==0.9.2` doesn't build properly for me under Python 3.13, but the version hosted as `fasttext==0.9.3` does.