
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <math.h>
#include "openglheader.h"

#include "utilities.h"
#include "GPUsparsemat.h"

/* Name of the compute shader program; assigned in LoadGPUTransposefShader */
/* and passed to glDeleteProgram in DeleteGPUTransposefShader. */
static GLuint program_id;
/* Uniform locations of the program, filled in LoadGPUTransposefShader in */
/* this order: [0] "stage", [1] "step", [2] "reverse", [3] "h", [4] "m", */
/* [5] "n", [6] "nnz". */
/* NOTE(review): glGetUniformLocation returns a signed GLint (-1 for an */
/* inactive uniform); the values are kept here as GLuint - confirm this is */
/* intended. */
static GLuint uloc[7];

/* Compiles the compute shader GPUtransposef.comp.glsl, links it into the */
/* program "GPUTransposef" (stored in program_id) and looks up the locations */
/* of its seven uniforms, which are stored in uloc[].  The OpenGL error state */
/* is checked at the end with ExitIfGLError. */
void LoadGPUTransposefShader ( void )
{
  static const char *source_file[] = { "GPUtransposef.comp.glsl" };
  static const GLchar *uniform_name[] =
    { "stage", "step", "reverse", "h", "m", "n", "nnz" };
  GLuint compute_shader;
  int    k;

  compute_shader = CompileShaderFiles ( GL_COMPUTE_SHADER, 1, source_file );
  program_id = LinkShaderProgram ( 1, &compute_shader, "GPUTransposef" );
        /* uloc[k] corresponds to uniform_name[k] */
  k = 0;
  while ( k < 7 ) {
    uloc[k] = glGetUniformLocation ( program_id, uniform_name[k] );
    k ++;
  }
        /* the shader object is not referenced here after linking */
  glDeleteShader ( compute_shader );
/*PrintProgramResources ( program_id, "GPUTransposef" );*/
  ExitIfGLError ( "LoadGPUTransposefShader" );
} /*LoadGPUTransposefShader*/

/* Deletes the compute program whose name is stored in program_id and then */
/* clears the stored name.  After that a repeated call passes 0 to */
/* glDeleteProgram, which OpenGL silently ignores, instead of passing the */
/* name of an already deleted program. */
void DeleteGPUTransposefShader ( void )
{
  glDeleteProgram ( program_id );
  program_id = 0;
} /*DeleteGPUTransposefShader*/

/* COMPUTE dispatches SIZEX x SIZEY x SIZEZ work groups of the current */
/* compute program and then issues a shader storage memory barrier. */
/* EXECSTAGE first sets the uniform at uloc[0] (the "stage" uniform) to STAGE */
/* and then does the same as COMPUTE. */
/* Both macros expand to a single do { } while ( 0 ) statement, so that they */
/* may be used, followed by a semicolon, as the body of an unbraced if/else */
/* or loop; their arguments are parenthesised in the expansion. */
#define COMPUTE(SIZEX,SIZEY,SIZEZ) \
  do { \
    glDispatchCompute ( (SIZEX), (SIZEY), (SIZEZ) ); \
    glMemoryBarrier ( GL_SHADER_STORAGE_BARRIER_BIT ); \
  } while ( 0 )
#define EXECSTAGE(STAGE,SIZEX,SIZEY,SIZEZ) \
  do { \
    glUniform1ui ( uloc[0], (STAGE) ); \
    COMPUTE ( (SIZEX), (SIZEY), (SIZEZ) ); \
  } while ( 0 )

/* Issues the sequence of stage 2 dispatches of the current compute program */
/* (presumably the passes of a sorting network - the shader is not visible */
/* here, TODO confirm). */
/*   nseq   - number of work groups in the y direction of each dispatch, */
/*   n      - length on which the number of passes is based; nothing is */
/*            done if n < 2, */
/*   revloc - location of the uniform set to GL_TRUE for the first dispatch */
/*            of each level and to GL_FALSE for the remaining ones, */
/*   hloc   - location of the uniform receiving the current span. */
/* With L being the number of significant bits of n-1, each dispatch uses */
/* 2^L/2 work groups in the x direction.  Level number k (k = 1,...,L) */
/* consists of one dispatch with span 2^k and the flag on, followed by */
/* dispatches with spans 2^(k-1),...,2 and the flag off. */
static void NetSort ( GLuint nseq, GLuint n, GLuint revloc, GLuint hloc )
{
  GLuint nlevels, rest, padded, ngroups, level, span, sub;

  if ( n < 2 )
    return;
        /* count the significant bits of n-1 */
  nlevels = 0;
  for ( rest = n-1;  rest != 0;  rest >>= 1 )
    nlevels ++;
        /* smallest power of two not less than n, and half of it */
  padded = 1 << nlevels;
  ngroups = padded/2;
  glUniform1ui ( uloc[0], 2 );
  sub = 1;
  span = 2;
  for ( level = 0;  level < nlevels;  level++ ) {
    glUniform1ui ( revloc, GL_TRUE );
    glUniform1ui ( hloc, span );
    COMPUTE ( ngroups, nseq, 1 );
    glUniform1ui ( revloc, GL_FALSE );
    while ( sub > 1 ) {
      glUniform1ui ( hloc, sub );
      COMPUTE ( ngroups, nseq, 1 );
      sub >>= 1;
    }
        /* the next level starts its descent from the current span */
    sub = span;
    span += span;
  }
  ExitIfGLError ( "NetSort" );
} /*NetSort*/

/* Fills *at with the result of running the stages of the GPUTransposef */
/* compute program on the sparse matrix *a; the dimensions of *at are those */
/* of *a swapped and the number of nonzero entries is the same. */
/*   at     - result; its fields m, n, nnz, buf[0] and buf[1] are */
/*            overwritten.  It must be a different object than *a, because */
/*            *a is read after *at has been written and may be zeroed at */
/*            the end. */
/*   a      - the input matrix, */
/*   keep_a - if nonzero, *a is left intact and a new buffer of nnz floats */
/*            is created for at->buf[1] (filled by stage 0); if zero, */
/*            a->buf[1] is handed over to *at, a->buf[0] is deleted and *a */
/*            is zeroed. */
/* Shader storage bindings used: 0 - a->buf[0], 1 - a->buf[1] (only if */
/* keep_a), 2 - at->buf[0], 3 - at->buf[1], 4 - a temporary buffer of nnz */
/* unsigned integers, deleted before return. */
/* NOTE(review): the meaning of the individual stages is defined by the */
/* shader, which is not visible here. */
/* The function always returns true; OpenGL errors are handled by */
/* ExitIfGLError. */
char GPUTransposeSparsef ( GPUSparseMatrix *at, GPUSparseMatrix *a,
                           char keep_a )
{
  GLuint atb[3];
  GLuint m, n, nnz;

  at->n = m = a->m;  at->m = n = a->n;  at->nnz = nnz = a->nnz;
  glUseProgram ( program_id );
  glBindBufferBase ( GL_SHADER_STORAGE_BUFFER, 0, a->buf[0] );
  glUniform1ui ( uloc[4], m );
  glUniform1ui ( uloc[5], n );
  glUniform1ui ( uloc[6], nnz );
  ExitIfGLError ( "GPUTransposeSparsef 0" );
  if ( keep_a ) {
        /* three new buffers: at->buf[0], the temporary one and at->buf[1] */
    glGenBuffers ( 3, atb );
    glBindBufferBase ( GL_SHADER_STORAGE_BUFFER, 1, a->buf[1] );
        /* glBindBufferBase also sets the generic binding, to which */
        /* the glBufferData call below applies */
    glBindBufferBase ( GL_SHADER_STORAGE_BUFFER, 3, at->buf[1] = atb[2] );
    glBufferData ( GL_SHADER_STORAGE_BUFFER, nnz*sizeof(GLfloat),
                   NULL, GL_DYNAMIC_DRAW );
    EXECSTAGE ( 0, nnz, 1, 1 );
    ExitIfGLError ( "GPUTransposeSparsef 1" );
  }
  else {
        /* only at->buf[0] and the temporary buffer are new; */
        /* the buffer a->buf[1] is taken over by *at. */
        /* The target must be GL_SHADER_STORAGE_BUFFER; */
        /* GL_SHADER_STORAGE_BLOCK is a program interface name, */
        /* which glBindBufferBase rejects with GL_INVALID_ENUM */
    glGenBuffers ( 2, atb );
    glBindBufferBase ( GL_SHADER_STORAGE_BUFFER, 3, at->buf[1] = a->buf[1] );
  }
  glBindBufferBase ( GL_SHADER_STORAGE_BUFFER, 2, at->buf[0] = atb[0] );
        /* the sum is computed in size_t, to avoid wrapping around */
        /* in 32-bit arithmetic before the multiplication */
  glBufferData ( GL_SHADER_STORAGE_BUFFER, ((size_t)n+1+nnz)*sizeof(GLuint),
                 NULL, GL_DYNAMIC_DRAW );
  glBindBufferBase ( GL_SHADER_STORAGE_BUFFER, 4, atb[1] );
  glBufferData ( GL_SHADER_STORAGE_BUFFER, nnz*sizeof(GLuint),
                 NULL, GL_DYNAMIC_DRAW );
  EXECSTAGE ( 1, nnz, 1, 1 );
        /* stage 2: one sequence of length nnz; uloc[2] and uloc[3] are */
        /* the locations of the uniforms "reverse" and "h" */
  NetSort ( 1, nnz, uloc[2], uloc[3] );
  ExitIfGLError ( "GPUTransposeSparsef 3" );
  EXECSTAGE ( 3, n+1, 1, 1 );
  glDeleteBuffers ( 1, &atb[1] );
  if ( !keep_a ) {
        /* a->buf[1] now belongs to *at, so only a->buf[0] is deleted */
    glDeleteBuffers ( 1, &a->buf[0] );
    memset ( a, 0, sizeof(GPUSparseMatrix) );
  }
  return true;
} /*GPUTransposeSparsef*/

