// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "geometry.h"
#include "buffer.h"

namespace embree
{
  /*! Grid Mesh */
  struct GridMesh : public Geometry
  {
    /*! type of this geometry: the geometry type mask value used for grid meshes */
    static const Geometry::GTypeMask geom_type = Geometry::MTY_GRID_MESH;

    /*! Description of a single grid primitive. The vertex at grid position
     *  (x,y) is stored at index startVtxID + x + y*lineVtxOffset of the
     *  vertex buffer, with x < resX and y < resY. */
    struct Grid 
    {
      unsigned int startVtxID;    //!< index of the vertex at grid position (0,0)
      unsigned int lineVtxOffset; //!< index distance between two consecutive grid rows
      unsigned short resX;        //!< number of vertices in x direction
      unsigned short resY;        //!< number of vertices in y direction

      /*! Border flag in x for a 3x3 vertex pattern starting at column x:
       *  bit 15 is set when column x+2 is not inside the grid anymore. */
      __forceinline unsigned int get3x3FlagsX(const unsigned int x) const
      {
        const bool at_border = x + 2 >= (unsigned int)resX;
        if (at_border) return (1<<15);
        return 0;
      }

      /*! Border flag in y for a 3x3 vertex pattern starting at row y:
       *  bit 15 is set when row y+2 is not inside the grid anymore. */
      __forceinline unsigned int get3x3FlagsY(const unsigned int y) const
      {
        const bool at_border = y + 2 >= (unsigned int)resY;
        if (at_border) return (1<<15);
        return 0;
      }

      /*! prints all members of the grid to the stream */
      __forceinline friend embree_ostream operator<<(embree_ostream cout, const Grid& t) {
        return cout << "Grid { startVtxID " << t.startVtxID
                    << ", lineVtxOffset " << t.lineVtxOffset
                    << ", resX " << t.resX
                    << ", resY " << t.resY
                    << " }";
      }
    };

  public:

    /*! grid mesh construction; the definition is not part of this header */
    GridMesh (Device* device); 

    /* geometry interface: the functions below are only declared here,
     * their implementations live outside of this header */
  public:
    void setMask(unsigned mask);
    void setNumTimeSteps (unsigned int numTimeSteps);
    void setVertexAttributeCount (unsigned int N);
    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
    void* getBuffer(RTCBufferType type, unsigned int slot);
    void updateBuffer(RTCBufferType type, unsigned int slot);
    void commit();
    bool verify();
    /* NOTE(review): presumably forwards to interpolate_impl<N> below - confirm in the implementation file */
    void interpolate(const RTCInterpolateArguments* const args);

    template<int N>
    void interpolate_impl(const RTCInterpolateArguments* const args)
    {
      unsigned int primID = args->primID;
      float U = args->u;
      float V = args->v;
      
      /* clamp input u,v to [0;1] range */
      U = max(min(U,1.0f),0.0f);
      V = max(min(V,1.0f),0.0f);
      
      RTCBufferType bufferType = args->bufferType;
      unsigned int bufferSlot = args->bufferSlot;
      float* P = args->P;
      float* dPdu = args->dPdu;
      float* dPdv = args->dPdv;
      float* ddPdudu = args->ddPdudu;
      float* ddPdvdv = args->ddPdvdv;
      float* ddPdudv = args->ddPdudv;
      unsigned int valueCount = args->valueCount;
      
      /* calculate base pointer and stride */
      assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) ||
             (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot <= vertexAttribs.size()));
      const char* src = nullptr; 
      size_t stride = 0;
      if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) {
        src    = vertexAttribs[bufferSlot].getPtr();
        stride = vertexAttribs[bufferSlot].getStride();
      } else {
        src    = vertices[bufferSlot].getPtr();
        stride = vertices[bufferSlot].getStride();
      }
      
      const Grid& grid = grids[primID];
      const int grid_width  = grid.resX-1;
      const int grid_height = grid.resY-1;
      const float rcp_grid_width = rcp(float(grid_width));
      const float rcp_grid_height = rcp(float(grid_height));
      const int iu = min((int)floor(U*grid_width ),grid_width);
      const int iv = min((int)floor(V*grid_height),grid_height);
      const float u = U*grid_width-float(iu);
      const float v = V*grid_height-float(iv);
      
      for (unsigned int i=0; i<valueCount; i+=N)
      {
        const size_t ofs = i*sizeof(float);
        const unsigned int idx0 = grid.startVtxID + (iv+0)*grid.lineVtxOffset + iu;
        const unsigned int idx1 = grid.startVtxID + (iv+1)*grid.lineVtxOffset + iu;
        
        const vbool<N> valid = vint<N>((int)i)+vint<N>(step) < vint<N>(int(valueCount));
        const vfloat<N> p0 = mem<vfloat<N>>::loadu(valid,(float*)&src[(idx0+0)*stride+ofs]);
        const vfloat<N> p1 = mem<vfloat<N>>::loadu(valid,(float*)&src[(idx0+1)*stride+ofs]);
        const vfloat<N> p2 = mem<vfloat<N>>::loadu(valid,(float*)&src[(idx1+1)*stride+ofs]);
        const vfloat<N> p3 = mem<vfloat<N>>::loadu(valid,(float*)&src[(idx1+0)*stride+ofs]);
        const vbool<N> left = u+v <= 1.0f;
        const vfloat<N> Q0 = select(left,p0,p2);
        const vfloat<N> Q1 = select(left,p1,p3);
        const vfloat<N> Q2 = select(left,p3,p1);
        const vfloat<N> U  = select(left,u,vfloat<N>(1.0f)-u);
        const vfloat<N> V  = select(left,v,vfloat<N>(1.0f)-v);
        const vfloat<N> W  = 1.0f-U-V;
        
        if (P) {
          mem<vfloat<N>>::storeu(valid,P+i,madd(W,Q0,madd(U,Q1,V*Q2)));
        }
        if (dPdu) { 
          assert(dPdu); mem<vfloat<N>>::storeu(valid,dPdu+i,select(left,Q1-Q0,Q0-Q1)*rcp_grid_width);
          assert(dPdv); mem<vfloat<N>>::storeu(valid,dPdv+i,select(left,Q2-Q0,Q0-Q2)*rcp_grid_height);
        }
        if (ddPdudu) { 
          assert(ddPdudu); mem<vfloat<N>>::storeu(valid,ddPdudu+i,vfloat<N>(zero));
          assert(ddPdvdv); mem<vfloat<N>>::storeu(valid,ddPdvdv+i,vfloat<N>(zero));
          assert(ddPdudv); mem<vfloat<N>>::storeu(valid,ddPdudv+i,vfloat<N>(zero));
        }
      }
    }

    /* only declared here; presumably adds the element counts of this mesh to 'counts' - confirm in the implementation file */
    void addElementsToCount (GeometryCounts & counts) const;
    
    /*! returns the sum of getNumQuads over all grids of the mesh */
    __forceinline unsigned int getNumTotalQuads() const
    {
      size_t total = 0;
      size_t gridID = 0;
      while (gridID < numPrimitives) {
        total += getNumQuads(gridID);
        gridID++;
      }
      return (unsigned int) total;
    }

    /*! returns the number of quads of a grid, which is (resX-1)*(resY-1), but at least 1 */
    __forceinline unsigned int getNumQuads(const size_t gridID) const
    {
      const Grid& g = grid(gridID);
      const int quadsX = (int)g.resX-1;
      const int quadsY = (int)g.resY-1;
      const int quads  = quadsX * quadsY;
      return (unsigned int) max((int)1,quads);
    }
    
    /*! returns (resX/2)*(resY/2) for a grid (integer division), but at least 1 */
    __forceinline unsigned int getNumSubGrids(const size_t gridID) const
    {
      const Grid& g = grid(gridID);
      const unsigned int halfX = (unsigned int)g.resX >> 1;
      const unsigned int halfY = (unsigned int)g.resY >> 1;
      const unsigned int count = halfX * halfY;
      return max((unsigned int)1,count);
    }

    /*! returns the data pointer of the first vertex buffer (vertices0) as float pointer */
    __forceinline float * getCompactVertexArray () const
    {
      auto base = vertices0.getPtr();
      return (float*) base;
    }

  public:

    /*! returns the number of vertices, taken from the vertex buffer of time step 0 */
    __forceinline size_t numVertices() const
    {
      const BufferView<Vec3fa>& firstStep = vertices[0];
      return firstStep.size();
    }
    
    /*! returns the description of the i'th grid */
    __forceinline const Grid& grid(size_t i) const
    {
      const Grid& entry = grids[i];
      return entry;
    }

    /*! returns the i'th vertex, read through the fast access view of the first time step */
    __forceinline const Vec3fa vertex(size_t i) const // FIXME: check if this does a unaligned load
    {
      const Vec3fa p = vertices0[i];
      return p;
    }

    /*! returns a pointer to the i'th vertex of the first time step */
    __forceinline const char* vertexPtr(size_t i) const
    {
      const char* const ptr = vertices0.getPtr(i);
      return ptr;
    }

    /*! returns the i'th vertex stored in the vertex buffer of time step itime */
    __forceinline const Vec3fa vertex(size_t i, size_t itime) const
    {
      const BufferView<Vec3fa>& stepVertices = vertices[itime];
      return stepVertices[i];
    }

    /*! returns the i'th vertex at the specified time, linearly interpolated
     *  between the two time steps of the time segment returned by timeSegment */
    __forceinline const Vec3fa vertex(size_t i, float time) const
    {
      float frac;
      const size_t segment = timeSegment(time, frac);

      /* vertex at the begin and at the end of the time segment */
      const Vec3fa begin = vertex(i, segment+0);
      const Vec3fa end   = vertex(i, segment+1);

      const float w_begin = 1.0f - frac;
      const float w_end   = frac;
      return madd(Vec3fa(w_begin),begin,w_end*end);
    }

    /*! returns a pointer to the i'th vertex of time step itime */
    __forceinline const char* vertexPtr(size_t i, size_t itime) const
    {
      const BufferView<Vec3fa>& stepVertices = vertices[itime];
      return stepVertices.getPtr(i);
    }

    /*! returns the vertex buffer index of the vertex at position (x,y) of grid g;
     *  x and y have to be smaller than the grid resolution */
    __forceinline size_t grid_vertex_index(const Grid& g, size_t x, size_t y) const
    {
      assert(x < (size_t)g.resX);
      assert(y < (size_t)g.resY);
      const size_t rowOffset = y * g.lineVtxOffset;
      return g.startVtxID + x + rowOffset;
    }
    
    /*! returns the vertex at position (x,y) of grid g for the first time step */
    __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y) const
    {
      return vertex(grid_vertex_index(g,x,y));
    }

    /*! returns the vertex at position (x,y) of grid g for time step itime */
    __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y, size_t itime) const
    {
      return vertex(grid_vertex_index(g,x,y),itime);
    }

    /*! returns the vertex at position (x,y) of grid g interpolated for the specified time */
    __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y, float time) const
    {
      return vertex(grid_vertex_index(g,x,y),time);
    }
    
    /*! gathers the four vertices of the quad at position (x,y) of grid g for
     *  the first time step, in the order (x,y), (x+1,y), (x+1,y+1), (x,y+1) */
    __forceinline void gather_quad_vertices(Vec3fa& v0, Vec3fa& v1, Vec3fa& v2, Vec3fa& v3, const Grid& g, size_t x, size_t y) const
    {
      const size_t x1 = x+1;
      const size_t y1 = y+1;
      v0 = grid_vertex(g,x ,y );
      v1 = grid_vertex(g,x1,y );
      v2 = grid_vertex(g,x1,y1);
      v3 = grid_vertex(g,x ,y1);
    }
    
    /*! gathers the four vertices of the quad at position (x,y) of grid g
     *  interpolated for the specified time, in the order (x,y), (x+1,y),
     *  (x+1,y+1), (x,y+1) */
    __forceinline void gather_quad_vertices(Vec3fa& v0, Vec3fa& v1, Vec3fa& v2, Vec3fa& v3, const Grid& g, size_t x, size_t y, float time) const
    {
      const size_t x1 = x+1;
      const size_t y1 = y+1;
      v0 = grid_vertex(g,x ,y ,time);
      v1 = grid_vertex(g,x1,y ,time);
      v2 = grid_vertex(g,x1,y1,time);
      v3 = grid_vertex(g,x ,y1,time);
    }

    /*! gathers quad vertices for both kinds of meshes: the time is only
     *  taken into account if the mesh has motion blur */
    __forceinline void gather_quad_vertices_safe(Vec3fa& v0, Vec3fa& v1, Vec3fa& v2, Vec3fa& v3, const Grid& g, size_t x, size_t y, float time) const
    {
      if (!hasMotionBlur()) {
        gather_quad_vertices(v0,v1,v2,v3,g,x,y);
        return;
      }
      gather_quad_vertices(v0,v1,v2,v3,g,x,y,time);
    }

    /*! Calculates the bounds of the quad at position (sx,sy) of grid g over
     *  all time steps. Returns false and leaves bbox untouched if one of the
     *  quad vertices is not valid. */
    __forceinline bool buildBoundsQuad(const Grid& g, size_t sx, size_t sy, BBox3fa& bbox) const
    {
      BBox3fa accum(empty);
      for (size_t itime=0; itime<numTimeSteps; itime++)
      {
        /* visit the 2x2 quad vertices row by row */
        for (size_t corner=0; corner<4; corner++)
        {
          const size_t x = sx + (corner &  1);
          const size_t y = sy + (corner >> 1);
          const Vec3fa p = grid_vertex(g,x,y,itime);
          if (unlikely(!isvalid(p))) return false;
          accum.extend(p);
        }
      }

      bbox = accum;
      return true;
    }
    
    /*! Calculates the bounds of the up to 3x3 vertices starting at position
     *  (sx,sy) of grid g over all time steps; the block is clipped to the grid
     *  resolution. Returns false and leaves bbox untouched if one of the
     *  vertices is not valid. */
    __forceinline bool buildBounds(const Grid& g, size_t sx, size_t sy, BBox3fa& bbox) const
    {
      /* end of the vertex block, clipped to the grid resolution */
      const size_t xEnd = min(sx+3,(size_t)g.resX);
      const size_t yEnd = min(sy+3,(size_t)g.resY);

      BBox3fa accum(empty);
      for (size_t itime=0; itime<numTimeSteps; itime++)
        for (size_t y=sy; y<yEnd; y++)
          for (size_t x=sx; x<xEnd; x++)
          {
            const Vec3fa p = grid_vertex(g,x,y,itime);
            if (unlikely(!isvalid(p))) return false;
            accum.extend(p);
          }

      bbox = accum;
      return true;
    }

    /*! Calculates the bounds of the up to 3x3 vertices starting at position
     *  (sx,sy) of grid g for the single time step itime; the block is clipped
     *  to the grid resolution. Returns false and leaves bbox untouched if one
     *  of the vertices is not valid. */
    __forceinline bool buildBounds(const Grid& g, size_t sx, size_t sy, size_t itime, BBox3fa& bbox) const
    {
      assert(itime < numTimeSteps);

      /* end of the vertex block, clipped to the grid resolution */
      const size_t xEnd = min(sx+3,(size_t)g.resX);
      const size_t yEnd = min(sy+3,(size_t)g.resY);

      BBox3fa accum(empty);
      for (size_t y=sy; y<yEnd; y++)
      {
        for (size_t x=sx; x<xEnd; x++)
        {
          const Vec3fa p = grid_vertex(g,x,y,itime);
          if (unlikely(!isvalid(p))) return false;
          accum.extend(p);
        }
      }

      bbox = accum;
      return true;
    }

    /*! checks if the grid is valid at the single time step itime (first time step by default) */
    __forceinline bool valid(size_t gridID, size_t itime=0) const
    {
      const range<size_t> single_step = make_range(itime, itime);
      return valid(gridID, single_step);
    }

    /*! Checks if the grid gridID is valid for all time steps of the inclusive
     *  range itime_range. A grid is valid if
     *   - gridID addresses an existing grid,
     *   - the grid has at least one vertex in each direction,
     *   - its first and last vertex are inside the vertex buffer, and
     *   - all its vertices are valid for each time step of the range. */
    __forceinline bool valid(size_t gridID, const range<size_t>& itime_range) const
    {
      if (unlikely(gridID >= grids.size())) return false;
      const Grid &g = grid(gridID);

      /* Grids without vertices are rejected explicitly: the vertex loops below
       * would accept them vacuously, and code iterating up to resX-1 or resY-1
       * in unsigned arithmetic would wrap around for a resolution of 0. */
      if (unlikely(g.resX == 0 || g.resY == 0)) return false;

      /* The extent is calculated in size_t, such that large vertex offsets
       * cannot wrap around in 32 bit arithmetic and pass the range check. */
      const size_t firstVtxID = (size_t)g.startVtxID;
      const size_t lastVtxID  = firstVtxID + (size_t)(g.resY-1)*(size_t)g.lineVtxOffset + (size_t)(g.resX-1);
      if (unlikely(firstVtxID >= vertices0.size())) return false;
      if (unlikely(lastVtxID  >= vertices0.size())) return false;

      for (size_t y=0;y<g.resY;y++)
        for (size_t x=0;x<g.resX;x++)
          for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
            if (!isvalid(grid_vertex(g,x,y,itime))) return false;
      return true;
    }

    /*! returns the bounds of the vertex block at (sx,sy) of grid g for time
     *  step itime; the result stays empty if buildBounds fails */
    __forceinline BBox3fa bounds(const Grid& g, size_t sx, size_t sy, size_t itime) const
    {
      BBox3fa result(empty);
      buildBounds(g,sx,sy,itime,result);
      return result;
    }

    /*! Returns the linear bounds of the vertex block at (sx,sy) of grid g for
     *  the time segment itime, built from the bounds at the time steps itime
     *  and itime+1. */
    __forceinline LBBox3fa linearBounds(const Grid& g, size_t sx, size_t sy, size_t itime) const {
      /* buildBounds does not write its output if it encounters an invalid
       * vertex, thus the bounds start out empty (same as in bounds()) to
       * never return uninitialized data */
      BBox3fa bounds0(empty), bounds1(empty);
      buildBounds(g,sx,sy,itime+0,bounds0);
      buildBounds(g,sx,sy,itime+1,bounds1);
      return LBBox3fa(bounds0,bounds1);
    }

    /*! calculates the linear bounds of the vertex block at (sx,sy) of grid g
     *  for the time range dt, based on the bounds of the individual time steps */
    __forceinline LBBox3fa linearBounds(const Grid& g, size_t sx, size_t sy, const BBox1f& dt) const
    {
      /* bounds of the vertex block at a single time step */
      const auto boundsAtTimeStep = [&] (size_t itime) { return bounds(g,sx,sy,itime); };
      return LBBox3fa(boundsAtTimeStep, dt, time_range, fnumTimeSegments);
    }

    /*! always returns positive infinity, independent of the primitive index */
    __forceinline float projectedPrimitiveArea(const size_t i) const
    {
      const float area = pos_inf;
      return area;
    }

  public:
    BufferView<Grid> grids;      //!< array of grids
    BufferView<Vec3fa> vertices0;        //!< fast access to first vertex buffer
    Device::vector<BufferView<Vec3fa>> vertices = device; //!< vertex array for each timestep
    Device::vector<RawBufferView> vertexAttribs = device; //!< vertex attributes

#if defined(EMBREE_SYCL_SUPPORT)
    
  public:
    /*! identifies one quad by the grid it belongs to (primID) and its position (x,y) inside that grid */
    struct PrimID_XY { uint32_t primID; uint16_t x,y; };
    Device::vector<PrimID_XY> quadID_to_primID_xy = device;  //!< maps a quad to the primitive ID and grid coordinates
#endif
  };

  namespace isa
  {
    struct GridMeshISA : public GridMesh
    {
      /*! construction, forwards the device to the GridMesh base class */
      GridMeshISA (Device* device) : GridMesh(device)
      {
      }

      /*! returns the linear bounds for the time range of the subgrid that is
       *  stored at position buildID of the subgrid build data */
      LBBox3fa vlinearBounds(size_t buildID, const BBox1f& time_range, const SubGridBuildData * const sgrids) const override
      {
        const SubGridBuildData& entry = sgrids[buildID];
        const unsigned int gridID = entry.primID;
        const size_t sx = entry.x();
        const size_t sy = entry.y();
        const Grid& g = grid(gridID);
        return linearBounds(g,sx,sy,time_range);
      }

#if defined(EMBREE_SYCL_SUPPORT)
      /*! Creates one PrimRef per valid quad of the quad range r and stores
       *  them consecutively into prims starting at index k. Quads with invalid
       *  vertices are skipped. The primID of each PrimRef is the quad index.
       *  Returns the accumulated PrimInfo of the created PrimRefs. */
      PrimInfo createPrimRefArray(PrimRef* prims, const range<size_t>& r, size_t k, unsigned int geomID) const override
      {
        PrimInfo pinfo(empty);
        for (size_t quadID=r.begin(); quadID<r.end(); quadID++)
        {
          BBox3fa quadBounds = empty;
          const PrimID_XY& q = quadID_to_primID_xy[quadID];
          const bool quadValid = buildBoundsQuad(grids[q.primID],q.x,q.y,quadBounds);
          if (quadValid)
          {
            const PrimRef ref(quadBounds,geomID,unsigned(quadID));
            pinfo.add_center2(ref);
            prims[k++] = ref;
          }
        }
        return pinfo;
      }
#endif
      
      /*! Creates the PrimRefs and the subgrid build data for all valid grids
       *  of the range r. Each grid is traversed in steps of 2 vertices in x
       *  and y; for each position with valid bounds a PrimRef and a
       *  SubGridBuildData entry are stored at index k, which is advanced
       *  afterwards. The primID of a PrimRef is its index k, the subgrid
       *  build data holds the position (combined with the 3x3 border flags)
       *  and the grid index. Returns the accumulated PrimInfo. */
      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids, const range<size_t>& r, size_t k, unsigned int geomID) const override 
      {
        PrimInfo pinfo(empty);
        for (size_t gridID=r.begin(); gridID<r.end(); gridID++)
        {
          if (!valid(gridID)) continue;
          const GridMesh::Grid &g = grid(gridID);

          /* the last vertex column/row does not start a subgrid */
          const unsigned int xEnd = g.resX-1u;
          const unsigned int yEnd = g.resY-1u;
          
          for (unsigned int y=0; y<yEnd; y+=2)
          {
            for (unsigned int x=0; x<xEnd; x+=2)
            {
              BBox3fa subgridBounds = empty;
              const bool subgridValid = buildBounds(g,x,y,subgridBounds); // get bounds of subgrid
              if (!subgridValid) continue;

              const PrimRef ref(subgridBounds,(unsigned)geomID,(unsigned)k);
              pinfo.add_center2(ref);

              const unsigned int flaggedX = x | g.get3x3FlagsX(x);
              const unsigned int flaggedY = y | g.get3x3FlagsY(y);
              sgrids[k] = SubGridBuildData(flaggedX, flaggedY, unsigned(gridID));
              prims[k++] = ref;
            }
          }
        }
        return pinfo;
      }

#if defined(EMBREE_SYCL_SUPPORT)
      /*! Creates one PrimRef per quad of the quad range r and stores them
       *  consecutively into prims starting at index k. The bounds of each
       *  PrimRef are derived from the linear bounds over the intersection of
       *  the geometry time range and the passed time range. The primID of
       *  each PrimRef is the quad index. Returns the accumulated PrimInfo. */
      PrimInfo createPrimRefArrayMB(PrimRef* prims, const BBox1f& time_range, const range<size_t>& r, size_t k, unsigned int geomID) const override
      {
        const BBox1f t0t1 = BBox1f::intersect(getTimeRange(), time_range);
        PrimInfo pinfo(empty);
        for (size_t quadID=r.begin(); quadID<r.end(); quadID++)
        {
          const PrimID_XY& q = quadID_to_primID_xy[quadID];
          const Grid& g = grids[q.primID];
          const LBBox3fa quadLBounds = linearBounds(g,q.x,q.y,t0t1);
          const PrimRef ref(quadLBounds.bounds(), unsigned(geomID), unsigned(quadID));
          pinfo.add_center2(ref);
          prims[k++] = ref;
        }
        return pinfo;
      }
#endif

      /*! Creates the motion blur PrimRefs and the subgrid build data for all
       *  grids of the range r that are valid for the time segments of t0t1.
       *  Each grid is traversed in steps of 2 vertices in x and y; for each
       *  position a PrimRefMB with the linear bounds over t0t1 and a
       *  SubGridBuildData entry are stored at index k, which is advanced
       *  afterwards. Returns the accumulated PrimInfoMB. */
      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, mvector<SubGridBuildData>& sgrids, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const override
      {
        PrimInfoMB pinfoMB(empty);
        for (size_t gridID=r.begin(); gridID<r.end(); gridID++)
        {
          if (!valid(gridID, timeSegmentRange(t0t1))) continue;
          const GridMesh::Grid &g = grid(gridID);

          /* the last vertex column/row does not start a subgrid */
          const unsigned int xEnd = g.resX-1u;
          const unsigned int yEnd = g.resY-1u;
          
          for (unsigned int y=0; y<yEnd; y+=2)
          {
            for (unsigned int x=0; x<xEnd; x+=2)
            {
              const LBBox3fa subgridLBounds = linearBounds(g,x,y,t0t1);
              const PrimRefMB ref(subgridLBounds,numTimeSegments(),time_range,numTimeSegments(),unsigned(geomID),unsigned(k));
              pinfoMB.add_primref(ref);

              const unsigned int flaggedX = x | g.get3x3FlagsX(x);
              const unsigned int flaggedY = y | g.get3x3FlagsY(y);
              sgrids[k] = SubGridBuildData(flaggedX, flaggedY, unsigned(gridID));
              prims[k++] = ref;
            }
          }
        }
        return pinfoMB;
      }
    };
  }

  /* NOTE(review): presumably declares the per-ISA factory function createGridMesh(Device*) that returns a GridMesh* - confirm against the DECLARE_ISA_FUNCTION macro */
  DECLARE_ISA_FUNCTION(GridMesh*, createGridMesh, Device*);
}