## S3 method for class 'cluster_pairs'
select_greedy(pairs, variable, score, threshold, preselect = NULL,
  id_x = NULL, id_y = NULL, ...)

## S3 method for class 'cluster_pairs'
select_n_to_m(pairs, variable, score, threshold, preselect = NULL,
  id_x = NULL, id_y = NULL, ...)

select_greedy(pairs, variable, score, threshold, preselect = NULL,
  id_x = NULL, id_y = NULL, ...)

## S3 method for class 'pairs'
select_greedy(pairs, variable, score, threshold, preselect = NULL,
  id_x = NULL, id_y = NULL, x = attr(pairs, "x"), y = attr(pairs, "y"),
  inplace = FALSE, include_ties = FALSE, n = 1L, m = 1L, ...)

select_n_to_m(pairs, variable, score, threshold, preselect = NULL,
  id_x = NULL, id_y = NULL, ...)

## S3 method for class 'pairs'
select_n_to_m(pairs, variable, score, threshold, preselect = NULL,
  id_x = NULL, id_y = NULL, x = attr(pairs, "x"), y = attr(pairs, "y"),
  inplace = FALSE, ...)
Arguments
pairs: a pairs object, such as generated by pair_blocking
variable: the name of the new variable to create in pairs. This will be a logical variable with a value of TRUE for the selected pairs.
score: name of the score/weight variable of the pairs. When not given and attr(pairs, "score") is defined, that is used.
threshold: the threshold to apply. Pairs with a score above the threshold are selected.
preselect: a logical variable with the same length as pairs has rows, or the name of such a variable in pairs. Pairs are only selected when preselect is TRUE. This is combined with threshold: a pair has to satisfy both conditions in order to be selected.
id_x: an integer vector with the same length as the number of rows in pairs, or the name of a column in x. This vector should identify unique objects in x. When not specified it is assumed that each element in x is unique.
id_y: an integer vector with the same length as the number of rows in pairs, or the name of a column in y. This vector should identify unique objects in y. When not specified it is assumed that each element in y is unique.
...: Used to pass additional arguments to methods
x: data.table with one half of the pairs.
y: data.table with the other half of the pairs.
inplace: logical indicating whether pairs should be modified in place. When pairs is large this can be more efficient.
include_ties: logical indicating whether, when pairs for a given record have an equal weight, all of these pairs should be included.
n: an integer. Each element of x can be linked to at most n elements of y.
m: an integer. Each element of y can be linked to at most m elements of x.
Returns
Returns the pairs with the variable given by variable added. This is a logical variable indicating which pairs are selected as matches.
Details
By default both methods force one-to-one matching (for select_greedy the arguments n and m set the maximum number of links per record and both default to 1). select_greedy uses a greedy algorithm that selects the first pair with the highest weight. select_n_to_m tries to optimise the total weight of all of the selected pairs. In general this will result in a better selection. However, select_n_to_m uses much more memory and is much slower and can therefore only be used when the number of possible pairs is not too large.
Note that when include_ties = TRUE the same record can still be selected more than once. In that case the pairs will have an equal weight.
Examples
data("linkexample1", "linkexample2")
pairs <- pair_blocking(linkexample1, linkexample2, "postcode")
pairs <- compare_pairs(pairs, c("lastname", "firstname", "address", "sex"))
model <- problink_em(~ lastname + firstname + address + sex, data = pairs)
pairs <- predict(model, pairs, type = "mpost", add = TRUE, binary = TRUE)

# Select pairs with a mpost > 0.5 and force one-to-one linkage
pairs <- select_n_to_m(pairs, "ntom", "mpost", 0.5)
pairs <- select_greedy(pairs, "greedy", "mpost", 0.5)
table(pairs$ntom, pairs$greedy)

# The same example as above using a cluster;
library(parallel)
cl <- makeCluster(2)
pairs <- cluster_pair_blocking(cl, linkexample1, linkexample2, "postcode")
compare_pairs(pairs, c("lastname", "firstname", "address", "sex"))
model <- problink_em(~ lastname + firstname + address + sex, data = pairs)
predict(model, pairs, type = "mpost", add = TRUE, binary = TRUE)

# Select pairs with a mpost > 0.5 and force one-to-one linkage
# select_n_to_m and select_greedy only work on pairs that are local;
# therefore we first collect the pairs
select_threshold(pairs, "selected", "mpost", 0.5)
local_pairs <- cluster_collect(pairs, "selected")
local_pairs <- select_n_to_m(local_pairs, "ntom", "mpost", 0.5)
local_pairs <- select_greedy(local_pairs, "greedy", "mpost", 0.5)
table(local_pairs$ntom, local_pairs$greedy)
stopCluster(cl)