Intel® Embree Ray Tracing Kernels
Discussion forum on the open source ray tracing kernels for fast photo-realistic rendering on Intel® CPU(s)

Building BVH4 or BVH8

Waxman__Evan
Beginner
2,101 Views

Hello,

I am trying to create a High Quality BVH4 structure using the BVH builder tutorial. I am aware that it is creating a BVH2 structure and I tried modifying a few arguments to accomplish this task. MaxBranchingFactor within the RTCBuildArguments was changed to a value of 4. Within the InnerNode stuct, I also increased the number of bounds and children to 4. When calling setChildren and setBounds, I made sure that it would loop through all 4 children and bounds instead of 2 as it was before. However, it seems to break when creating the leaf nodes. Could anyone point me in the right direction on how to create a BVH4/8 structure using the BVH builder tutorial?

Thank you,

Evan

0 Kudos
4 Replies
BenthinC_Intel
Employee
2,101 Views

Hi Evan,

could you post or PM your code so that I can have a look? Do you get a segfault when creating the leaf nodes? For a high quality BVH you also need to reserve some extra space (BuildSettings.extraSpace) to store spatial splits. Other than that one needs to modify a couple of BuildSettings entries like sahBlockSize, maxBranchingFactor, minLeafSize, maxLeafSize, quality etc to get a good high-quality n-wide BVH, but that should be it.

Carsten.   

0 Kudos
Waxman__Evan
Beginner
2,101 Views

Hi Carsten,

Thank you for the quick response. I commented out the asserts in the create(), setChildren(), and setBounds() functions so my code is no longer breaking when creating the leaf nodes. I also modified sah() to account for 4 children instead of 2, however it looks like not all inner nodes are being created with 4 children which is causing sah() to throw an exception and halt the program. Is there not supposed to be 4 children created at every inner node? Is this a result of my BuildSettings? Also how do you determine an appropriate amount of extraSpace and sahBlockSize?

Thank you for you help, here is my bvh_builder_device.cpp file:

// ======================================================================== //
// Copyright 2009-2018 Intel Corporation                                    //
//                                                                          //
// Licensed under the Apache License, Version 2.0 (the "License");          //
// you may not use this file except in compliance with the License.         //
// You may obtain a copy of the License at                                  //
//                                                                          //
//     http://www.apache.org/licenses/LICENSE-2.0                           //
//                                                                          //
// Unless required by applicable law or agreed to in writing, software      //
// distributed under the License is distributed on an "AS IS" BASIS,        //
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
// See the License for the specific language governing permissions and      //
// limitations under the License.                                           //
// ======================================================================== //

#include "../common/tutorial/tutorial_device.h"

namespace embree
{	
  RTCScene g_scene  = nullptr;

  /* This function is called by the builder to signal progress and to
   * report memory consumption. */
  bool memoryMonitor(void* userPtr, ssize_t bytes, bool post) {
    return true;
  }

  bool buildProgress (void* userPtr, double f) {
    return true;
  }

  void splitPrimitive (const RTCBuildPrimitive* prim, unsigned int dim, float pos, RTCBounds* lprim, RTCBounds* rprim, void* userPtr)
  {
    assert(dim < 3);
    assert(prim->geomID == 0);
    *(BBox3fa*) lprim = *(BBox3fa*) prim;
    *(BBox3fa*) rprim = *(BBox3fa*) prim;
    (&lprim->upper_x)[dim] = pos;
    (&rprim->lower_x)[dim] = pos;
  }

  struct Node
  {
    virtual float sah() = 0;
	unsigned int nodeType;
  };

  struct InnerNode : public Node
  {
    BBox3fa bounds[4];
    Node* children[4];

    InnerNode() {
      bounds[0] = empty; 
	  bounds[1] = empty;
	  bounds[3] = empty; 
	  bounds[4] = empty;
	  children[0] = nullptr; 
	  children[1] = nullptr; 
	  children[2] = nullptr; 
	  children[3] = nullptr;
	  nodeType = 1;
    }

    float sah() {
      return 1.0f + (area(bounds[0])*children[0]->sah() + area(bounds[1])*children[1]->sah() + area(bounds[2])*children[2]->sah() + area(bounds[3])*children[3]->sah())/area(merge(bounds[0],bounds[1],bounds[2],bounds[3]));
    }

    static void* create (RTCThreadLocalAllocator alloc, unsigned int numChildren, void* userPtr)
    {
      //assert(numChildren == 2);
      void* ptr = rtcThreadLocalAlloc(alloc,sizeof(InnerNode),16);
      return (void*) new (ptr) InnerNode;
    }

    static void  setChildren (void* nodePtr, void** childPtr, unsigned int numChildren, void* userPtr)
    {
      //assert(numChildren == 2);
      for (size_t i=0; i<numChildren; i++)
        ((InnerNode*)nodePtr)->children = (Node*) childPtr;
    }

    static void  setBounds (void* nodePtr, const RTCBounds** bounds, unsigned int numChildren, void* userPtr)
    {
      //assert(numChildren == 2);
      for (size_t i=0; i<numChildren; i++)
        ((InnerNode*)nodePtr)->bounds = *(const BBox3fa*) bounds;
    }
  };
  //leaf node seems to be everything on the bottom/lowest nodes/triangles
  struct LeafNode : public Node
  {
    unsigned id;
    BBox3fa bounds;

	LeafNode(unsigned id_new, const BBox3fa& bounds_new) {
		id = id_new;
		bounds = bounds_new;
		nodeType = 2;
	}

    float sah() {
      return 1.0f;
    }

    static void* create (RTCThreadLocalAllocator alloc, const RTCBuildPrimitive* prims, size_t numPrims, void* userPtr)
    {
      assert(numPrims == 1);
      void* ptr = rtcThreadLocalAlloc(alloc,sizeof(LeafNode),16);
      return (void*) new (ptr) LeafNode(prims->primID,*(BBox3fa*)prims);
    }
  };
  

  void build(RTCBuildQuality quality, avector<RTCBuildPrimitive>& prims_i, char* cfg, size_t extraSpace = 0)
  {
    rtcSetDeviceMemoryMonitorFunction(g_device,memoryMonitor,nullptr);

    RTCBVH bvh = rtcNewBVH(g_device);

    avector<RTCBuildPrimitive> prims;
    prims.reserve(prims_i.size()+extraSpace);
    prims.resize(prims_i.size());

    /* settings for BVH build */
    RTCBuildArguments arguments = rtcDefaultBuildArguments();
    arguments.byteSize = sizeof(arguments);
    arguments.buildFlags = RTC_BUILD_FLAG_DYNAMIC;
    arguments.buildQuality = quality;
    arguments.maxBranchingFactor = 4;
    arguments.maxDepth = 1024;
    arguments.sahBlockSize = 2;
    arguments.minLeafSize = 1;
    arguments.maxLeafSize = 1;
    arguments.traversalCost = 1.0f;
    arguments.intersectionCost = 1.0f;
    arguments.bvh = bvh;
    arguments.primitives = prims.data();
    arguments.primitiveCount = prims.size();
    arguments.primitiveArrayCapacity = prims.capacity();
    arguments.createNode = InnerNode::create;
    arguments.setNodeChildren = InnerNode::setChildren;
    arguments.setNodeBounds = InnerNode::setBounds;
    arguments.createLeaf = LeafNode::create;
    arguments.splitPrimitive = splitPrimitive;
    arguments.buildProgress = buildProgress;
    arguments.userPtr = nullptr;
    

	/* we recreate the prims array here, as the builders modify this array */
	for (size_t j = 0; j < prims.size(); j++) {
		prims = prims_i;
		
	}

    std::cout << "Building BVH over " << prims.size() << " primitives, " << std::flush;
    double t0 = getSeconds();
    Node* root = (Node*) rtcBuildBVH(&arguments);
    double t1 = getSeconds();
    const float sah = root ? root->sah() : 0.0f;
    std::cout << 1000.0f*(t1-t0) << "ms, " << 1E-6*double(prims.size())/(t1-t0) << " Mprims/s, sah = " << sah << " [DONE]" << std::endl;


    rtcReleaseBVH(bvh);
  }

  /* called by the C++ code for initialization */
  extern "C" void device_init (char* cfg)
  {
    /* set start render mode */
    renderTile = renderTileStandard;

    /* create random bounding boxes */
	const size_t N = 200;
    const size_t extraSpace = 1000;
    
    avector<RTCBuildPrimitive> prims;
    prims.resize(N);

	/*	Create primitives	*/
    for (size_t i=0; i<N; i++)
    {
      const float x = float(drand48());
      const float y = float(drand48());
      const float z = float(drand48());
      const Vec3fa p = 1000.0f*Vec3fa(x,y,z);
      const BBox3fa b = BBox3fa(p,p+Vec3fa(1.0f));

      RTCBuildPrimitive prim;
      prim.lower_x = b.lower.x;
      prim.lower_y = b.lower.y;
      prim.lower_z = b.lower.z;
      prim.geomID = 0;
      prim.upper_x = b.upper.x;
      prim.upper_y = b.upper.y;
      prim.upper_z = b.upper.z;
      prim.primID = (unsigned) i;
      prims = prim;
    }

	/*	only want to test high quality build
    std::cout << "Low quality BVH build:" << std::endl;
    build(RTC_BUILD_QUALITY_LOW,prims,cfg);

    std::cout << "Normal quality BVH build:" << std::endl;
    build(RTC_BUILD_QUALITY_MEDIUM,prims,cfg);
	*/
    std::cout << "High quality BVH build:" << std::endl;
    build(RTC_BUILD_QUALITY_HIGH,prims,cfg,extraSpace);
  }

  /* task that renders a single screen tile */
  void renderTileStandard(int taskIndex, int threadIndex, int* pixels,
                          const unsigned int width,
                          const unsigned int height,
                          const float time,
                          const ISPCCamera& camera,
                          const int numTilesX,
                          const int numTilesY)
  {
  }

  /* task that renders a single screen tile */
  void renderTileTask(int taskIndex, int threadIndex, int* pixels,
                      const unsigned int width,
                      const unsigned int height,
                      const float time,
                      const ISPCCamera& camera,
                      const int numTilesX,
                      const int numTilesY)
  {
  }

  /* called by the C++ code to render */
  extern "C" void device_render (int* pixels,
                                 const int width,
                                 const int height,
                                 const float time,
                                 const ISPCCamera& camera)
  {
  }

  /* called by the C++ code for cleanup */
  extern "C" void device_cleanup () {
  }
}

 

-Evan

0 Kudos
BenthinC_Intel
Employee
2,101 Views

Hi Evan,

thanks for sharing the code. First, there's a little out of bounds access here:

InnerNode() {

...

bounds[4] = empty;

}

You are correct that not all inner nodes have always four valid children, therefore the sah() function should look something like this:

float sah() {
      float sum_area = 0.0f;
      BBox3fa merged_bounds(empty);
      for (size_t i=0;i<4;i++)
      {
        if (children == nullptr) break;
        sum_area += area(bounds)*children->sah();
        merged_bounds.extend(bounds);       
      }
      return 1.0f + sum_area / area(merged_bounds);
    }

Appropriate amount of extra space should be in 40-100% range with respect to the number of input primitives. Obviously there's a performance vs. memory trade off here.

sahBlockSize should be 4 for a 4-wide BVH and 8 for a 8-wide BVH.

Hope this helps.

C.

0 Kudos
Waxman__Evan
Beginner
2,101 Views

Oops, didn't catch that out of bounds error.

This really helps thank you!

-Evan

0 Kudos
Reply