Tuesday, March 4, 2014

GCC and vectorization

#include<ctime>
#include<iostream>
#include<memory>

using namespace std;

// Stopwatch built on std::clock(): reports the processor time consumed since
// the object was constructed or since the most recent Reset().
class Clock {
 public:
  Clock() = default;

  // Elapsed time in raw std::clock() ticks.
  std::clock_t Now() const { return ElapsedTicks(); }

  // Elapsed time converted to seconds via CLOCKS_PER_SEC.
  double NowSeconds() const {
    const std::clock_t ticks = ElapsedTicks();
    return static_cast<double>(ticks) / CLOCKS_PER_SEC;
  }

  // Restarts the measurement from the current std::clock() reading.
  void Reset() { start_time_ = std::clock(); }

 private:
  // Ticks elapsed between the stored starting point and now.
  std::clock_t ElapsedTicks() const { return std::clock() - start_time_; }

  // std::clock() reading captured at construction or at the last Reset().
  std::clock_t start_time_ = std::clock();
};

// Element-wise product: writes b[i] * c[i] into a[i] for the first `size`
// elements, in ascending index order. The pointers carry no aliasing
// qualifiers, so the compiler must assume the arrays may overlap.
// A non-positive `size` performs no work.
void Func(int* a, int *b, int* c, int size) {
  for (int idx = 0; idx < size; ++idx) {
    a[idx] = b[idx] * c[idx];
  }
}

// Same element-wise product as Func (a[i] = b[i] * c[i] for the first `size`
// elements), but every pointer is qualified with __restrict__. That is a
// promise from the caller that the three arrays do not overlap, which removes
// the aliasing concern that can keep the compiler from vectorizing the loop.
// A non-positive `size` performs no work.
void FuncForSIMD(int* __restrict__ a, int* __restrict__ b, int* __restrict__ c,
                 int size) {
  for (int idx = 0; idx < size; ++idx) {
    a[idx] = b[idx] * c[idx];
  }
}

// Benchmark driver: times Func and FuncForSIMD over the same three arrays of
// one billion ints each and prints the elapsed std::clock() ticks for both.
int main() {
  constexpr int size = 1000000000;
  // unique_ptr<int[]> (not unique_ptr<int>) so the arrays are released with
  // delete[]; releasing new[] storage with plain delete is undefined behaviour.
  // The trailing () value-initializes every element to zero, so the kernels
  // never read indeterminate values. It also means the memory has already been
  // written once before the first timed call.
  std::unique_ptr<int[]> a(new int[size]());
  std::unique_ptr<int[]> b(new int[size]());
  std::unique_ptr<int[]> c(new int[size]());
  Clock clock;
  Func(a.get(), b.get(), c.get(), size);
  std::cout << "time without simd: " << clock.Now() << std::endl;
  clock.Reset();
  FuncForSIMD(a.get(), b.get(), c.get(), size);
  std::cout << "time with simd: " << clock.Now() << std::endl;
  return 0;
}
To compile we need to run:
g++ -std=c++0x simd.cc -o simd -O3
The output on my machine looks as follows:
time without simd: 2160000
time with simd: 390000
So the `__restrict__` version runs about 5.5x faster (2160000 vs. 390000 clock ticks).
A great explanation is given here:
Demystifying The Restrict Keyword - http://cellperformance.beyond3d.com/articles/2006/05/demystifying-the-restrict-keyword.html