Title: | An Interface to the 'fastText' Library |
---|---|
Description: | An interface to the 'fastText' library <https://github.com/facebookresearch/fastText>. The package can be used for text classification and to learn word vectors. An example how to use 'fastTextR' can be found in the 'README' file. |
Authors: | Florian Schwendinger [aut], Emil Hvitfeldt [aut, cre] |
Maintainer: | Emil Hvitfeldt <[email protected]> |
License: | BSD_3_clause + file LICENSE |
Version: | 2.1.0.9000 |
Built: | 2024-11-06 03:26:28 UTC |
Source: | https://github.com/emilhvitfeldt/fasttextr |
FastText Model
Create a new FastText model. The available methods
are the same as the package functions but without the prefix "ft_"
and without the need to provide the model.
fasttext()
fasttext()
ft <- fasttext()
ft <- fasttext()
TODO
ft_analogies(model, word_triplets, k = 10L)
ft_analogies(model, word_triplets, k = 10L)
model |
an object inheriting from "fasttext". |
word_triplets |
a character vector of length three giving the words of the analogy query. |
k |
an integer giving the number of nearest neighbors to be returned. |
.
## Not run: ft_analogies(model, c("berlin", "germany", "france"), k = 6L) ## End(Not run)
## Not run: ft_analogies(model, c("berlin", "germany", "france"), k = 6L) ## End(Not run)
An auxiliary function for defining the control variables.
ft_control( loss = c("softmax", "hs", "ns"), learning_rate = 0.05, learn_update = 100L, word_vec_size = 100L, window_size = 5L, epoch = 5L, min_count = 5L, min_count_label = 0L, neg = 5L, max_len_ngram = 1L, nbuckets = 2000000L, min_ngram = 3L, max_ngram = 6L, nthreads = 1L, threshold = 1e-04, label = "__label__", verbose = 0, pretrained_vectors = "", output = "", save_output = FALSE, seed = 0L, qnorm = FALSE, retrain = FALSE, qout = FALSE, cutoff = 0L, dsub = 2L, autotune_validation_file = "", autotune_metric = "f1", autotune_predictions = 1L, autotune_duration = 300L, autotune_model_size = "" )
ft_control( loss = c("softmax", "hs", "ns"), learning_rate = 0.05, learn_update = 100L, word_vec_size = 100L, window_size = 5L, epoch = 5L, min_count = 5L, min_count_label = 0L, neg = 5L, max_len_ngram = 1L, nbuckets = 2000000L, min_ngram = 3L, max_ngram = 6L, nthreads = 1L, threshold = 1e-04, label = "__label__", verbose = 0, pretrained_vectors = "", output = "", save_output = FALSE, seed = 0L, qnorm = FALSE, retrain = FALSE, qout = FALSE, cutoff = 0L, dsub = 2L, autotune_validation_file = "", autotune_metric = "f1", autotune_predictions = 1L, autotune_duration = 300L, autotune_model_size = "" )
loss |
a character string giving the name of the loss function;
allowed values are "softmax", "hs" and "ns". |
learning_rate |
a numeric giving the learning rate, the default value is 0.05. |
learn_update |
an integer giving after how many tokens the learning rate
should be updated. The default value is 100L. |
word_vec_size |
an integer giving the length (size) of the word vectors. |
window_size |
an integer giving the size of the context window. |
epoch |
an integer giving the number of epochs. |
min_count |
an integer giving the minimal number of word occurrences. |
min_count_label |
an integer giving the minimal number of label occurrences. |
neg |
an integer giving how many negatives are sampled (only used if loss is "ns"). |
max_len_ngram |
an integer giving the maximum length of ngrams used. |
nbuckets |
an integer giving the number of buckets. |
min_ngram |
an integer giving the minimal ngram length. |
max_ngram |
an integer giving the maximal ngram length. |
nthreads |
an integer giving the number of threads. |
threshold |
a numeric giving the sampling threshold. |
label |
a character string specifying the label prefix (default is "__label__"). |
verbose |
an integer giving the verbosity level, the default value
is 0. |
pretrained_vectors |
a character string giving the file path to the pretrained word vectors which are used for the supervised learning. |
output |
a character string giving the output file path. |
save_output |
a logical (default is FALSE). |
seed |
an integer |
qnorm |
a logical (default is FALSE). |
retrain |
a logical (default is FALSE). |
qout |
a logical (default is FALSE). |
cutoff |
an integer (default is 0L). |
dsub |
an integer (default is 2L). |
autotune_validation_file |
a character string |
autotune_metric |
a character string (default is "f1"). |
autotune_predictions |
an integer (default is 1L). |
autotune_duration |
an integer (default is 300L). |
autotune_model_size |
a character string |
a list with the control variables.
ft_control(learning_rate=0.1)
ft_control(learning_rate=0.1)
Load a previously saved model from file.
ft_load(file)
ft_load(file)
file |
a character string giving the name of the file to be read in. |
an object inheriting from "fasttext".
## Not run: model <- ft_load("dbpedia.bin") ## End(Not run)
## Not run: model <- ft_load("dbpedia.bin") ## End(Not run)
TODO
ft_nearest_neighbors(model, word, k = 10L)
ft_nearest_neighbors(model, word, k = 10L)
model |
an object inheriting from "fasttext". |
word |
a character string giving the word. |
k |
an integer giving the number of nearest neighbors to be returned. |
.
## Not run: ft_nearest_neighbors(model, "enviroment", k = 6L) ## End(Not run)
## Not run: ft_nearest_neighbors(model, "enviroment", k = 6L) ## End(Not run)
Applies normalization to a given text.
ft_normalize(txt)
ft_normalize(txt)
txt |
a character vector to be normalized. |
a character vector.
## Not run: ft_normalize(some_text) ## End(Not run)
## Not run: ft_normalize(some_text) ## End(Not run)
Write a previously trained model to file.
ft_save(model, file, what = c("model", "vectors", "output"))
ft_save(model, file, what = c("model", "vectors", "output"))
model |
an object inheriting from "fasttext". |
file |
a character string giving the name of the file. |
what |
a character string giving what should be saved. |
## Not run: ft_save(model, "my_model.bin", what = "model") ## End(Not run)
## Not run: ft_save(model, "my_model.bin", what = "model") ## End(Not run)
Obtain sentence vectors from a previously trained model.
ft_sentence_vectors(model, sentences)
ft_sentence_vectors(model, sentences)
model |
an object inheriting from "fasttext". |
sentences |
a character vector giving the sentences. |
a matrix containing the sentence vectors.
## Not run: ft_sentence_vectors(model, c("sentence", "vector")) ## End(Not run)
## Not run: ft_sentence_vectors(model, c("sentence", "vector")) ## End(Not run)
Evaluate the quality of the predictions. For the model evaluation precision and recall are used.
ft_test(model, file, k = 1L, threshold = 0)
ft_test(model, file, k = 1L, threshold = 0)
model |
an object inheriting from "fasttext". |
file |
a character string giving the location of the validation file. |
k |
an integer giving the number of labels to be returned. |
threshold |
a double giving the threshold. |
## Not run: ft_test(model, file) ## End(Not run)
## Not run: ft_test(model, file) ## End(Not run)
Train a new word representation model or supervised classification model.
ft_train( file, method = c("supervised", "cbow", "skipgram"), control = ft_control(), ... )
ft_train( file, method = c("supervised", "cbow", "skipgram"), control = ft_control(), ... )
file |
a character string giving the location of the input file. |
method |
a character string giving the method, possible values are
"supervised", "cbow" and "skipgram". |
control |
a list giving the control variables, for more information
see ft_control. |
... |
additional control arguments inserted into the control list. |
## Not run: cntrl <- ft_control(nthreads = 1L) model <- ft_train("my_data.txt", method="supervised", control = cntrl) ## End(Not run)
## Not run: cntrl <- ft_control(nthreads = 1L) model <- ft_train("my_data.txt", method="supervised", control = cntrl) ## End(Not run)
Obtain word vectors from a previously trained model.
ft_word_vectors(model, words)
ft_word_vectors(model, words)
model |
an object inheriting from "fasttext". |
words |
a character vector giving the words. |
a matrix containing the word vectors.
## Not run: ft_word_vectors(model, c("word", "vector")) ## End(Not run)
## Not run: ft_word_vectors(model, c("word", "vector")) ## End(Not run)
Obtain all the words from a previously trained model.
ft_words(model)
ft_words(model)
model |
an object inheriting from "fasttext". |
a character vector.
## Not run: ft_words(model) ## End(Not run)
## Not run: ft_words(model) ## End(Not run)
Predict values based on a previously trained model.
ft_predict( model, newdata, k = 1L, threshold = 0, rval = c("sparse", "dense", "slam"), ... )
ft_predict( model, newdata, k = 1L, threshold = 0, rval = c("sparse", "dense", "slam"), ... )
model |
an object inheriting from "fasttext". |
newdata |
a character vector giving the new data. |
k |
an integer giving the number of labels to be returned. |
threshold |
a double within |
rval |
a character string controlling the return value, allowed
values are "sparse", "dense" and "slam". |
... |
currently not used. |
NULL if a 'result_file' is given; otherwise, if 'prob' is true, a data.frame
with the predicted labels and the corresponding probabilities; if 'prob'
is false, a character vector with the predicted labels.
## Not run: ft_predict(model, newdata) ## End(Not run)
## Not run: ft_predict(model, newdata) ## End(Not run)