Just comparing different approaches (Not considering radix sort):
#include <algorithm>
#include <deque>
#include <iostream>
#include <unordered_set>
#include <set>
#include <vector>
#include <chrono>
template <template <typename ...> class Container, typename T, typename ... A, typename Comp>
inline bool insert_sorted(Container<T, A...>& container, T const& e, Comp const& comp) {
auto const it = std::lower_bound(container.begin(), container.end(), e, comp);
if (it != container.end() and not comp(e, *it)) { return false; }
container.insert(it, e);
return true;
}
template <template <typename ...> class Container, typename T, typename ... A>
inline bool insert_sorted(Container<T, A...>& container, T const& e) {
return insert_sorted(container, e, std::less<T>{});
}
int main() {
using namespace std::chrono;
typedef std::vector<int> data_type;
const unsigned Size = unsigned(1) << 24;
const unsigned Limit = 1000;
data_type data;
data.reserve(Size);
for(unsigned i = 0; i < Size; ++i) {
int value = double(Limit) * std::rand() / RAND_MAX - 0.1;
data.push_back(value);
while(i < Size - 1 && rand() < RAND_MAX * 0.25) {
data.push_back(value);
++i;
}
}
std::cout
<< "Data\n"
<< "====\n"
<< " Size of data: " << Size << '\n';
std::cout
<< "Unorderd Set\n"
<< "============\n";
{
auto start = system_clock::now();
typedef std::unordered_set<int> set_type;
set_type set;
unsigned i = 0;
for( ; i < Size - 1; ++i) {
// Ignore a range of equal values
while(data[i] == data[i+1]) ++i;
set.insert(data[i]);
}
if(i < Size)
set.insert(data[i]);
auto stop = system_clock::now();
std::cout
<< "Number of different elements: "
<< set.size() << '\n';
std::cout
<< " Timing: "
<< duration_cast<duration<double>>(stop - start).count()
<< '\n';
}
std::cout
<< "Set\n"
<< "===\n";
{
auto start = system_clock::now();
typedef std::set<int> set_type;
set_type set;
unsigned i = 0;
for( ; i < Size - 1; ++i) {
// Ignore a range of equal values
while(data[i] == data[i+1]) ++i;
set.insert(data[i]);
}
if(i < Size)
set.insert(data[i]);
auto stop = system_clock::now();
std::cout
<< "Number of different elements: "
<< set.size() << '\n';
std::cout
<< " Timing: "
<< duration_cast<duration<double>>(stop - start).count()
<< '\n';
}
std::cout
<< "Sorted Vector\n"
<< "=============\n";
{
auto start = system_clock::now();
typedef std::vector<int> set_type;
set_type set;
unsigned i = 0;
for( ; i < Size - 1; ++i) {
// Ignore a range of equal values
while(data[i] == data[i+1]) ++i;
insert_sorted(set, data[i]);
}
if(i < Size)
insert_sorted(set, data[i]);
auto stop = system_clock::now();
std::cout
<< "Number of different elements: "
<< set.size() << '\n';
std::cout
<< " Timing: "
<< duration_cast<duration<double>>(stop - start).count()
<< '\n';
}
std::cout
<< "BitVector\n"
<< "=========\n";
{
auto start = system_clock::now();
typedef std::vector<bool> set_type;
set_type set(Limit);
unsigned i = 0;
unsigned elements = 0;
for( ; i < Size; ++i) {
if( ! set[data[i]]) {
set[data[i]] = true;
++elements;
}
}
auto stop = system_clock::now();
std::cout
<< "Number of different elements: "
<< elements << '\n';
std::cout
<< " Timing: "
<< duration_cast<duration<double>>(stop - start).count()
<< '\n';
}
std::cout
<< "Sorted Data\n"
<< "===========\n";
{
auto start = system_clock::now();
std::sort(data.begin(), data.end());
auto last = std::unique(data.begin(), data.end());
auto stop = system_clock::now();
std::cout
<< "Number of different elements: "
<< last - data.begin() << '\n';
std::cout
<< " Timing: "
<< duration_cast<duration<double>>(stop - start).count()
<< '\n';
}
return 0;
}
Compiled with g++ -std=c++11 -O3 gives:
Data
====
Size of data: 16777216
Unorderd Set
============
Number of different elements: 1000
Timing: 0.269752
Set
===
Number of different elements: 1000
Timing: 1.23478
Sorted Vector
=============
Number of different elements: 1000
Timing: 1.13783
BitVector
=========
Number of different elements: 1000
Timing: 0.038408
Sorted Data
===========
Number of different elements: 1000
Timing: 1.32827
Hence if memory is no issue or the range of numbers is limited setting a bit is the best choice. Otherwise, unordered_set is a good one.