Assuming your `sales` dataset has 3,000 * 300 = 900,000 rows and both data
frames (`sales` and `sales_score`) have a `customer_id` column, you can do
something like:
# Group the row numbers of the scoring set by customer. pred_groups is a
# list whose names are the customer_id's and whose elements are integer
# vectors of row numbers into sales_score.
pred_groups <- split(seq_len(nrow(sales_score)), sales_score$customer_id)

# Fit one random forest per customer on that customer's rows of `sales`,
# predict that customer's rows of `sales_score`, and let unsplit() put the
# per-customer predictions back into the row order of `sales_score`.
preds <- unsplit(
  setNames(
    lapply(names(pred_groups), function(customer_id) {
      # Train using only observations for this customer. names() yields
      # character ids; `==` coerces the other side to character, so the
      # comparison also works when the column is integer. which() drops NA
      # comparisons, which a bare logical subset would turn into all-NA rows.
      train_rows <- which(sales$customer_id == customer_id)
      sales_rf <- randomForest(
        Sales ~ .,
        data = sales[train_rows, ],
        ntree = 500,
        importance = TRUE
      )
      # Predict only this customer's rows of the scoring set.
      predict(sales_rf, sales_score[pred_groups[[customer_id]], ])
    }),
    names(pred_groups)
  ),
  sales_score$customer_id
)

# preds is a vector of predictions with one entry per row of sales_score
# (the scoring set, not the training set).
print(head(preds))
Edit: Per @joran, here is a solution with a `for` loop:
# Group the row numbers of the scoring set by customer: a list named by
# customer_id whose elements are integer row numbers into sales_score.
pred_groups <- split(seq_len(nrow(sales_score)), sales_score$customer_id)

# Preallocate one slot per row of sales_score. Start from NA rather than 0
# so that a row which never receives a prediction (e.g. an NA customer_id,
# which split() drops) is not mistaken for a predicted value of zero.
preds <- rep(NA_real_, nrow(sales_score))

for (customer_id in names(pred_groups)) {
  # Train using only observations for this customer. which() drops NA
  # comparisons, which a bare logical subset would turn into all-NA rows.
  train_rows <- which(sales$customer_id == customer_id)
  sales.rf <- randomForest(
    Sales ~ .,
    data = sales[train_rows, ],
    ntree = 500,
    importance = TRUE
  )
  # Write this customer's predictions into their original row positions.
  pred_rows <- pred_groups[[customer_id]]
  preds[pred_rows] <- predict(sales.rf, sales_score[pred_rows, ])
}