//MC# Parallel Fast Fourier Transform (pFFT)
//This implementation is based on the Cooley-Tukey algorithm
//
using System;
using System.Text;

public class Complex
{
    public double Re = 0.0;
    public double Im = 0.0;

    public Complex() { }
    public Complex(double re, double im){ Re = re; Im = im; }
}

class Program
{
 public static void Main(string[] args)
    {
        if (args.Length!=3){
            Console.WriteLine(Usage());
            return;
        }

  //n - input vector length (must be power of two)
  //n1 - number of Cooley-Tukey's matrix columns
  //n2 - number of Cooley-Tukey's matrix rows
  //np - number of processes

  int n=0, n1 = 0, n2 = 0, np=0;
        n = (int)Math.Pow(2,Int32.Parse(args[0]));
  n1 = Int32.Parse(args[1]);
  np = Int32.Parse(args[2]);

  //input vector generation
  Complex[] X = new Complex[n];
        Random r = new Random();
        for (int i = 0; i < n; i++)
            X[i] = new Complex(r.NextDouble() * 10, 0);

        if ((n1 > n) || (n1 <= 0))
        {
         Console.WriteLine(Usage() + "Param n1 is invalid: n1=" + n1.ToString()+  ". Vector length=" + X.Length.ToString());
            return;
        }

        n2 = n / n1;

  Console.WriteLine("*RUN*");

        DateTime dt1 = DateTime.Now;
        Complex[] pY = (new Program()).FFT(X, n1, n2,np);
        DateTime dt2 = DateTime.Now;

  Console.WriteLine(" Parallel FFT:");
  Console.WriteLine(" n=" + n + " n1=" + n1 + " n2=" + n2 +
        "   Elapsed time is " + (dt2 - dt1).TotalSeconds);
    }

 public handler Wait void ()& channel Ready(){return ;}

 //    Cooley-Tukey Algorithm

    public Complex[] FFT(Complex[] X, int n1, int n2, int np)
    {
       int n = X.Length;
      Complex[] Y = new Complex[n];
      Complex[] P = new Complex[n];
      Complex[] T = new Complex[n];

        for (int k = 0; k < np; k++)
            Column_Iterative_Serial_FFT(X, Y, k * n2*n1/np, n1,n2,np);
        for (int k = 0; k < np; k++)
           Wait?();

        for (int i = 0; i < np; i++)
            Row_Iterative_Serial_FFT(Y, P, T, i * n1*n2/np, n1,n2,np);
       for (int i = 0; i < np; i++)
           Wait?();

        return T;
    }

 public async Column_Iterative_Serial_FFT(Complex[] a, Complex[] A, int start, int n1, int n2,int np)
 {
  int block_len=n1/np;
  for (int k = 0; k < block_len; k++)
            Column_Iterative_FFT(a, A, start+k * n2, n1, n2);

  //Column Twiddle Factor Multiplication

  int x,y;          //x,y - indexes of column and row of transpose matrix accordingly
  double wn_Re=0,arg = 0,wn_Im=0,tmp=0;
  int ks,t=0,j=0,i=0;
  for (int q = 0; q < n2*block_len; q++)
        {
            j = (int)Math.Ceiling((double)(q + 1) / n2) - 1;
            i = q - ((q) / n2) * n2;
            t =start+j * n2+i;

           split_idx(t,n2,n1,out x, out y);
            arg = 2 * Math.PI * x * y / (n2 * n1);
            wn_Re = Math.Cos(arg);
            wn_Im = Math.Sin(arg);

            ks=t/n2+n1*(t-n2*(t/n2));

           tmp = A[ks].Re * wn_Re - A[ks].Im * wn_Im;
          A[ks].Im = A[ks].Re * wn_Im + A[ks].Im * wn_Re;
          A[ks].Re = tmp;
   }
  Ready ! ();
 }

 public async Row_Iterative_Serial_FFT(Complex[] a, Complex[] A, Complex[] T,int start, int n1,int n2,int np)
 {
  for (int i = 0; i < n2/np; i++)
  {
   Row_Iterative_FFT(a, A, start+i * n1, n1);

   //Sort elements in right order to create output vector

   for(int q=0;q<n1;q++)
   T[TL_idx(start+i * n1+q,n1,n2)]=A[start+i * n1+q];
  }
  Ready ! ();
 }


 //Translate index i of untransposed vector to linearized index of transposed vector
    int TL_idx(int i, int raw_len, int col_len)
    {
        int raw_i = i / (raw_len);
        return ( raw_i + (i - raw_i * raw_len) * col_len );
    }

 //index delinearization
 void split_idx(int i,int raw_len, int col_len, out int x, out int y)
 {
   y = i / (raw_len);
   x=i - y * raw_len;
 }

public void Column_Iterative_FFT(Complex[] a, Complex[] A, int start, int n1, int n2)
 {
  int j, k, m, m2, km2,ktemp, s;
  double wn_Re, wn_Im, w_Re, w_Im;
  double arg, t_Re, t_Im;
  double u_Re, u_Im, tmp;

  int logN = 0;
  m = n2;

  while( m > 1 )
  {
   m = m / 2;
   logN++;
  }

  int iTbcr=0;
  int ibcr=0;
  int temp;
  for (k = 0; k < n2; k++)
  {
   temp=bit_reverse(k, logN);
   iTbcr=(temp + start) / n2 + n1*(temp + start - ((temp + start)/ n2) * n2);
   ibcr=(k + start) / n2 + n1*(k + start - ((k + start)/ n2) * n2);
   A[iTbcr]=a[ibcr];
  }

  for ( s = 1; s <= logN; s++ )
  {
   m = 1 << s;

   arg = 2.0 * Math.PI / m;

   wn_Re = Math.Cos ( arg );
   wn_Im = Math.Sin ( arg );

   w_Re = 1.0;w_Im = 0.0;

   m2 = m >> 1;
   for ( j = 0; j < m2; j++ )
   {
    for ( k = j; k < n2; k += m )
    {
     km2=(k + m2 + start)/n2 + n1*(k + m2 + start - ((k + m2 + start)/ n2) * n2);
     ktemp=(k + start)/n2 + n1*(k + start - ((k + start)/ n2) * n2);

     t_Re = w_Re * A [ km2 ].Re - w_Im * A [ km2 ].Im;
     t_Im = w_Re * A [ km2 ].Im + w_Im * A [ km2 ].Re;

     u_Re = A[ktemp].Re;
     u_Im = A[ktemp].Im;

     A[ktemp].Re = u_Re + t_Re;
     A[ktemp].Im = u_Im + t_Im;

     A [km2 ].Re = u_Re - t_Re;
     A [ km2].Im = u_Im - t_Im;
    }
   tmp = w_Re * wn_Re - w_Im * wn_Im;
   w_Im = w_Re * wn_Im + w_Im * wn_Re;
   w_Re = tmp;
   }

  }

 }

 public void Row_Iterative_FFT(Complex[] a, Complex[] A, int start, int n1)
{
  int j, k, m, m2, km2, s;
  double wn_Re, wn_Im, w_Re, w_Im;
  double arg, t_Re, t_Im;
  double u_Re, u_Im, tmp;

  int logN = 0;
  m = n1;
  while( m > 1 )
  {
   m = m / 2;
   logN++;
  }

  for ( k = 0; k < n1; k++ )
   A[bit_reverse(k, logN) + start] = a[k + start];

  for ( s = 1; s <= logN; s++ )
  {
   m = 1 << s;

   arg = 2.0 * Math.PI / m;

   wn_Re = Math.Cos ( arg );
   wn_Im = Math.Sin ( arg );

   w_Re = 1.0;w_Im = 0.0;

   m2 = m >> 1;
   for ( j = 0; j < m2; j++ )
   {
    for ( k = j; k < n1; k += m )
    {
     km2 = k + m2+start;

     t_Re = w_Re * A[ km2 ].Re - w_Im * A[ km2 ].Im;
     t_Im = w_Re * A[ km2 ].Im + w_Im * A[ km2 ].Re;

     u_Re = A[k + start].Re;
     u_Im = A[k + start].Im;

     A[k + start].Re = u_Re + t_Re;
     A[k + start].Im = u_Im + t_Im;

     A[ km2 ].Re = u_Re - t_Re;
     A[ km2 ].Im = u_Im - t_Im;
    }

   tmp = w_Re * wn_Re - w_Im * wn_Im;
   w_Im = w_Re * wn_Im + w_Im * wn_Re;
   w_Re = tmp;
   }
  }
 }

    public  int bit_reverse(int k, int size)
    {
        int right_unit = 1;
        int left_unit = 1 << (size - 1);

        int result = 0, bit;

        for (int i = 0; i < size; i++)
        {
            bit = k & right_unit;
            if (bit != 0)
                result = result | left_unit;
            right_unit = right_unit << 1;
            left_unit = left_unit >> 1;
        }
        return (result);
    }

    static string Usage()
    {
        StringBuilder s = new StringBuilder();
       
for (int i = 0; i < Console.WindowWidth; i++) s.Append("-");
        return  ( s.ToString() + Environment.NewLine +
                      "Usage: pFFT.exe p n1 np" + Environment.NewLine +
                      "where" + Environment.NewLine +
                      " p -  n=2^p - input vector length" + Environment.NewLine +
                      " n1 - width of block" + Environment.NewLine +
                     " np - number of processes" + Environment.NewLine +
                     s.ToString() + Environment.NewLine );
    }
}