Search this blog

29 July, 2013

Tiny HDR writer

It's really annoying to see the lack of simple (i.e. one function) writers of HDR image formats. I wrote my own Radiance HDR (RGB-exponent) and floating point TIFF writers. (forgive the sin of using std::string as a vector of bytes, was done as certain file write function in the framework used strings). 

They are rather ignorant as you will see from the source, rather slow too... But making them clever is left as an exercise to the reader, let's say this is really nothing more than documentation on the formats themselves, or better, on the minimal subset of the formats you need to know in order to write out HDR data.

Also you might want, as the TIFF routine writes raw float data, to convert it into an "inline" operation (i.e. a BeginTiff, PushFloat, EndTiff kind of interface), which is simple enough, especially if you move the IFD before the image data... Also, it would be much easier if it wrote the endianness in the header based on your current platform's byte order, instead of writing byte-by-byte as it does now.

Update: Aras Pranckevičius tweeted his EXR writer, so I was wrong, there was at least one simple HDR writer out there already. Also, EXR is more widespread than floating point TIFF, and even easier... Partially related, Jon Olick has neat single-file JPEG and MPEG writers, handy (and I'm sure everybody knows about stb_image and stb_image_write, but just in case...)!


// http://paulbourke.net/dataformats/tiff/ and http://partners.adobe.com/public/developer/en/tiff/TIFF6.pdf
// Not all programs support floating-point TIFFs, this was tested reading it back using Picturenaut and HDRShop
// Encodes a w x h RGB image as an uncompressed, single-strip, big-endian ("MM")
// 32-bit floating-point TIFF and returns the complete file as a byte string.
//
//  w, h           - image size in pixels
//  RGBdata        - w*h pixels, row-major, floatsPerPixel floats per pixel; only the
//                   first three floats (R, G, B) of each pixel are written
//  floatsPerPixel - distance between consecutive pixels, in floats (must be >= 3)
//
// File layout: 8-byte header, raw pixel data, IFD with 12 entries, then the
// out-of-line BitsPerSample and SampleFormat arrays.
static std::string EncodeFloatTIFF(unsigned int w, unsigned int h, const float* RGBdata, unsigned int floatsPerPixel = 4)
{
 assert(floatsPerPixel>=3); // we write only three floats (RGB) but support larger strides
 static_assert(sizeof(float) == 4, "samples are written as 32-bit IEEE floats");

 std::string outData;

 const unsigned int image_size_bytes = w*h*3 * sizeof(float);
 outData.reserve(image_size_bytes + 500); // 500 is some slack for headers etc, I should compute it exactly... :)

 // Big-endian emitters; everything in the file goes through these
 auto put8 = [&outData](unsigned int v) { outData.push_back(static_cast<char>(v & 0xff)); };
 auto put16 = [&put8](unsigned int v) { put8(v >> 8); put8(v); };
 auto put32 = [&put8](unsigned int v) { put8(v >> 24); put8(v >> 16); put8(v >> 8); put8(v); };

 // 12-byte IFD entries: tag, field type (3 = SHORT, 4 = LONG), value count, value or offset
 auto putShortTag = [&](unsigned int tag, unsigned int value) { put16(tag); put16(3); put32(1); put16(value); put16(0); /* padding */ };
 auto putLongTag = [&](unsigned int tag, unsigned int value) { put16(tag); put16(4); put32(1); put32(value); };
 // Dimensions may be SHORT or LONG; use LONG only when the value does not fit in 16 bits
 auto putDimensionTag = [&](unsigned int tag, unsigned int value) { if (value > 0xffff) putLongTag(tag, value); else putShortTag(tag, value); };
 // Three SHORTs do not fit in the 4-byte value field, so the entry stores an offset to them
 auto putShortArrayTag = [&](unsigned int tag, unsigned int count, unsigned int offset) { put16(tag); put16(3); put32(count); put32(offset); };

 // Header
 put8(0x4d); put8(0x4d); // First two chars specify MM for big endian
 put16(42); // Tiff version ID

 const unsigned int IFD_offset = 8 + image_size_bytes; // IFD table follows the image
 put32(IFD_offset);

 // Image data. The float's bytes are read through unsigned char (no aliasing issue)
 // and emitted most-significant byte first, whatever the host byte order is.
 const uint32_t endianProbe = 1;
 const bool hostIsLittleEndian = *reinterpret_cast<const unsigned char*>(&endianProbe) == 1;

 const float* pixel = RGBdata;
 for (unsigned int y=0; y<h; y++)
 {
  for (unsigned int x=0; x<w; x++, pixel += floatsPerPixel)
  {
   for (unsigned int channel=0; channel<3; channel++)
   {
    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(pixel + channel);
    for (unsigned int i=0; i<4; i++)
     put8(bytes[hostIsLittleEndian ? 3 - i : i]);
   }
  }
 }

 // IFD Tags
 const unsigned int NUM_IFD = 12;
 const unsigned int BPS_offset = IFD_offset + 2 + (NUM_IFD * 12) + 4; // right after the IFD and its terminator
 const unsigned int SF_offset = BPS_offset + 3*2;

 assert(outData.size() == IFD_offset);
 put16(NUM_IFD); // Number of tags

 // TIFF requires the entries to be sorted in ascending order by tag
 putDimensionTag(0x100, w);                // width
 putDimensionTag(0x101, h);                // height
 putShortArrayTag(0x102, 3, BPS_offset);   // bits per sample (three values, stored out of line)
 putShortTag(0x103, 1);                    // compression: none
 putShortTag(0x106, 2);                    // photometric interpretation: RGB
 putLongTag(0x111, 8);                     // strip offset: image starts right after the 8-byte header
 putShortTag(0x112, 1);                    // orientation
 putShortTag(0x115, 3);                    // samples per pixel: three (RGB)
 putDimensionTag(0x116, h);                // rows per strip: the whole image is one strip
 putLongTag(0x117, image_size_bytes);      // strip byte count (total size)
 putShortTag(0x11c, 1);                    // planar configuration: single image plane
 putShortArrayTag(0x153, 3, SF_offset);    // sample format (three values, stored out of line)

 put32(0); // IFD END (no further IFD)

 // bits per sample data
 assert(outData.size() == BPS_offset);
 for (unsigned int i=0; i<3; i++)
  put16(static_cast<unsigned int>(8*sizeof(float)));

 // sample format data (1 = uint, 2 = sint, 3 = float)
 assert(outData.size() == SF_offset);
 for (unsigned int i=0; i<3; i++)
  put16(3);

 return outData;
}

// Encodes a w x h RGB float image as a Radiance .hdr (RGBE) file and returns the
// complete file as a byte string.
//
//  w, h           - image size in pixels
//  RGBdata        - w*h pixels, row-major, floatsPerPixel floats per pixel; only the
//                   first three floats (R, G, B) of each pixel are used
//  floatsPerPixel - distance between consecutive pixels, in floats (must be >= 3)
//
// Negative (and NaN) channel values are written as 0, since RGBE has no sign.
// Scanlines use the "new RLE" framing but contain only literal (non-run) chunks;
// for widths the RLE framing cannot describe, pixels are written flat instead.
static std::string EncodeRadianceHDR(unsigned int w, unsigned int h, const float* RGBdata, unsigned int floatsPerPixel = 4)
{
 assert(floatsPerPixel >= 3); // we write only three floats (RGB) but support larger strides

 // Key-Value pairs after RADIANCE are optional
 //const char header[] = "#?RADIANCE\nEXPOSURE=1\nGAMMA=2.2\nFORMAT=32-bit_rle_rgbe\n\n";
 const char header[] = "#?RADIANCE\nFORMAT=32-bit_rle_rgbe\n\n";

 std::string outData;
 outData.reserve(w*h*4 + sizeof(header) + 200); // 200 is some slack...

 outData.append(header, sizeof(header)-1);
 outData.append("-Y ", 3);
 outData += std::to_string(h);
 outData.append(" +X ", 4);
 outData += std::to_string(w);
 outData.push_back('\n');

 // The RLE scanline header stores the width in 15 bits and readers only look for
 // it when 8 <= width <= 32767; outside that range scanlines must be flat RGBE.
 const bool rleFraming = (w >= 8) && (w <= 0x7fff);

 // One plane per RGBE component, as the RLE framing stores each component separately
 std::vector<unsigned char> scanline[4];
 for (auto& channel : scanline)
  channel.resize(w);

 for (unsigned int y=0; y<h; y++)
 {
  for (unsigned int x=0; x<w; x++, RGBdata += floatsPerPixel)
  {
   // The comparison is false for negatives and NaN, so both become 0
   const double r = RGBdata[0] > 0.0f ? RGBdata[0] : 0.0;
   const double g = RGBdata[1] > 0.0f ? RGBdata[1] : 0.0;
   const double b = RGBdata[2] > 0.0f ? RGBdata[2] : 0.0;

   double maxV = r;
   if (maxV < g) maxV = g;
   if (maxV < b) maxV = b;

   unsigned char encodedPixel[4] = { 0, 0, 0, 0 };
   if (maxV >= 1e-32) // smaller values are stored as black
   {
    // The shared exponent comes from the largest channel: its mantissa lands in
    // [128, 256) and the other channels are scaled by the same factor.
    int e = 0;
    const double scale = std::frexp(maxV, &e) * 256.0 / maxV;
    encodedPixel[0] = static_cast<unsigned char>(scale * r);
    encodedPixel[1] = static_cast<unsigned char>(scale * g);
    encodedPixel[2] = static_cast<unsigned char>(scale * b);
    encodedPixel[3] = static_cast<unsigned char>(e + 128);
   }

   for (unsigned int c=0; c<4; c++)
    scanline[c][x] = encodedPixel[c];
  }

  if (!rleFraming)
  {
   // Flat scanline: interleaved R, G, B, E bytes, no per-line header
   for (unsigned int x=0; x<w; x++)
    for (unsigned int c=0; c<4; c++)
     outData.push_back(static_cast<char>(scanline[c][x]));
   continue;
  }

  // RLE header: 2, 2, width high byte, width low byte
  outData.push_back(2);
  outData.push_back(2);
  outData.push_back(static_cast<char>((w>>8) & 0xff));
  outData.push_back(static_cast<char>(w & 0xff));

  // For simplicity, write everything as literal chunks (no actual runs)
  for (const auto& channel : scanline)
  {
   for (size_t pos = 0; pos < channel.size(); )
   {
    const size_t remaining = channel.size() - pos;
    // the last bit in a char, if set, would indicate a RLE run, we want to avoid that
    const unsigned char toWrite = remaining > 127 ? 127 : static_cast<unsigned char>(remaining);

    outData.push_back(static_cast<char>(toWrite)); // length of the "non run" data
    outData.append(reinterpret_cast<const char*>(&channel[pos]), toWrite);
    pos += toWrite;
   }
  }
 }

 return outData;
}

14 July, 2013

DX11: GPU "printf"

So, first a little "announcement": I'm crafting a small DX11 rendering framework in my spare time. I want to have it opensourced, and it's based on MJP's excellent SampleFramework11.
The goals are to provide an environment roughly as fast to iterate upon as FXComposer was (I consider it dead now...) but for programmers, without being a "shader editor".
If you're interested in collaborating, send me an email at c0de517e (it's a gmail account) with a brief introduction, there is an interesting list of things to do.

That said, this is a little bit of functionality Maurizio Cerrato and I have been working on for a couple of days: a "printf"-like function for pixel (and compute) shaders. It all started when chatting with Daniel Sewell (a brilliant guy, he was my rendering lead on Fight Night): he pointed out that, working on compute shaders, he had found that a neat way to debug them was to display all kinds of interesting debug visualizations by having geometry shaders "decode" buffers and emit lines.

if(IsDebuggedPixel(input.PositionSS.xy)) DebugDrawFloat(float2(ssao, bloom.x), clipPos);
The astute readers will at this point have already all figured it out. PS and CS support append buffers, so a "printf" has only to append some data to a buffer that later you can convert to lines in a geometry shader.

You could emit such data per each PS invocation and later sift through it and display what you needed in a meaningful way, but that will be quite slow (and at that point you might want to consider just packing everything into some MRT outputs). The idea behind appendbuffers is to do the work only for a handful of invocations (e.g. screen positions, if current sv_position equals the pixel to "debug" then GPU printf...).

In order to keep everything snappy we also minimize the size of the structure we use in the append buffer: you can't really printf strings, the debugger so far supports only one to three floats with color and position, or lines. Lines are where we started really; our struct contains two end-points, a color (index) and a flag which distinguishes lines from float printf. Floats just reinterpret one of the endpoints as the data to print.

This append buffer structure then gets fed to a VS/GS that is invoked twice as many times as the append buffer count (via draw indirect; you need to multiply the count by two in a small CS. Remember, you can't emit the start/end vertices as two separate append calls because the order of these is not deterministic, the vertices would end up all mixed in the buffer!), and the GS emits extra lines if we're printing floats, to display a small line-based font.

If you're thinking that is lame, well it is, there are certain limitations in the number of primitives the GS can emit that effectively limit the number of digits you can display, and you have to be careful about that, I "optimized" the code to display the most digits possible which unfortunately gives you very low-precision 3-float printf and higher precision 2-float and 1-float (you could though call three times the 1-float version... as there the ordering of the three call doesn't matter).

Keeping the same number of printed digits, the point has to float...
Why not use a bitmap font instead? Glad you asked. Laziness, partially justified by the fact that I didn't want to have two different append buffers, one for lines and one for fonts, as append buffers are a scarce resource on DX11. But it's a very lame justification, because there are plenty of workarounds left for the reader: you could filter the append buffer into two drawcalls in a compute shader, or even draw lines as quads, which would probably be better anyway!

Anyhow, together with shader hot-reloading (which everybody has, right?), this is quite a handy trick. Bonus: on a similar note, have a look at this shadertoy snippet by my coworker Paul Malin... brilliant guy!

Some code, without doubt full of bugs:

Snippet from the CPU/C++ side, drawing the debug lines...
void ShaderDebugDraw(ID3D11DeviceContextcontextconst Float4x4viewProjectionMatrixconst Float4x4projMatrix )
{
    SampleFramework11::PIXEvent market(L"ShaderDebug Draw");
 
    context->CopyStructureCount(AppendBufferCountCopy, 0, AppendBuffer.UAView);
 
    // We need a compute shader to write BufferCountUAV, as we need to multiply CopyStructureCount by two 
    ID3D11ShaderResourceView* srViews[] = { AppendBuffer.SRView };
    ID3D11UnorderedAccessView* uaViews[] = { AppendBufferCountCopyUAV };
    UINT uavsCount[] = { 0 };
    context->CSSetUnorderedAccessViews(1, 1, uaViews, uavsCount);
    context->CSSetShader(DebugDrawShader.AcquireCS(), NULL, 0);
    context->Dispatch(1,1,1);
    context->CSSetShader(NULLNULL, 0);
    uaViews[0] = NULL;
    context->CSSetUnorderedAccessViews(1, 1, uaViews, uavsCount);
 
    // Set all IA stage inputs to NULL, since we're not using it at all.
    void* nulls[D3D11_IA_VERTEX_INPUT_RESOURCE_SLOT_COUNT] = { NULL };
 
    context->IASetVertexBuffers(0, D3D11_IA_VERTEX_INPUT_RESOURCE_SLOT_COUNT, (ID3D11Buffer**)nulls, (UINT*)nulls, (UINT*)nulls);
    context->IASetInputLayout(NULL);
    context->IASetIndexBuffer(NULLDXGI_FORMAT_UNKNOWN, 0);
 
    // Draw debug lines
    srViews[0] =  AppendBuffer.SRView;
    context->VSSetShaderResources(0, 1, srViews);
    context->GSSetShaderResources(0, 1, srViews);
    context->GSSetShader(DebugDrawShader.AcquireGS(), NULL, 0);  
    context->VSSetShader(DebugDrawShader.AcquireVS(), NULL, 0);
    context->PSSetShader(DebugDrawShader.AcquirePS(), NULL, 0);
 
    shaderDebugDrawDataVS.Data.ViewProjection = viewProjectionMatrix;
    shaderDebugDrawDataVS.Data.Projection = projMatrix;
    shaderDebugDrawDataVS.ApplyChanges(context);
    shaderDebugDrawDataVS.SetVS(context, 0);
 
    context->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_LINELIST);
    context->DrawInstancedIndirect(AppendBufferCountCopy, 0);
[...]

This is roughly how the shader library looks for emitting debug lines/debug numbers from pixel shaders
// One element of the debug append buffer: either a line segment or a float "printf".
struct ShaderDebugLine
{
 float3 posStart; // line start; for float prints, the position the value is drawn at
 float3 posEnd; // line end; for float prints, reinterpreted as the value(s) to print (see DebugDrawFloat)
 uint color; // color index (the drawing VS looks it up in ShaderDebugColors)
 uint flag; // primitive type bits (SHADER_DEBUG_PRIM_*) OR'ed with a space flag (SHADER_DEBUG_FLAG_*)
};
 
// Per-frame settings that select which pixels emit debug output (see IsDebuggedPixel).
cbuffer ShaderDebugData : register(b13)
{
 float2 debugPixelCoords; // pixel being debugged; compared against SV_Position when debugType == 1
 float2 oneOverDisplaySize; // presumably 1 / render-target size in pixels - confirm against the CPU side
 int debugType; // 1 = only the pixel at debugPixelCoords, 2 = pixels on a 100-pixel grid, anything else = off
};
// Queues a three-float "printf": appends one ShaderDebugLine tagged as FLOAT3,
// with the value stored in posEnd and the draw position in posStart.
//  number    - the three values to print
//  pos       - where to draw them, interpreted according to spaceFlag
//  color     - color index stored with the entry
//  spaceFlag - SHADER_DEBUG_FLAG_* describing the space 'pos' is expressed in
void DebugDrawFloat(float3 number,  float3 pos, int color = 0, uint spaceFlag = SHADER_DEBUG_FLAG_2D)
{
 ShaderDebugLine entry;
 entry.flag = SHADER_DEBUG_PRIM_FLOAT3|spaceFlag;
 entry.color = color;
 entry.posStart = pos;
 entry.posEnd = number; // the end-point slot doubles as the payload
 ShaderDebugAppendBuffer.Append(entry);
}
// Maps an SV_Position pixel coordinate to clip-space xy: [0,size] -> [-1,1] in x and
// [0,size] -> [1,-1] in y (y is flipped).
float2 SVPosToClipspace(float2 svPos, float2 oneOverDisplaySize)
{
 float2 normalized = svPos * oneOverDisplaySize; // 0..1 across the display
 return normalized * float2(2,-2) + float2(-1,1);
}
 
// Returns true when the pixel at svPos (an SV_Position.xy) should emit debug output,
// according to debugType in ShaderDebugData.
bool IsDebuggedPixel(float2 svPos)
{
 // This is a bit tricky because it depends on the MSAA pattern

 if(debugType == 1)
 {
  // Single pixel: Manhattan distance between debugPixelCoords + 0.5 and svPos
  float2 delta = abs(debugPixelCoords - svPos + float2(0.5,0.5));
  return dot(delta, 1.0.xx) <= 0.01f;
 }

 if(debugType == 2)
 {
  // Grid: position within each 100x100 tile must be (almost) at the tile origin
  float2 inTile = abs(svPos % float2(100,100));
  return dot(inTile, 1.0.xx) <= 1.01f;
 }

 return false;
}

And finally, the VS/GS/CS shaders needed to draw the debug buffer emitted from the various PS executions:
// Index of the first DigitFont vertex of each glyph. Glyphs 0-9 are the digits,
// glyph 10 is '-' and glyph 11 is '.'; the final entry marks the end of the table,
// so glyph g spans [DigitFontOffsets[g], DigitFontOffsets[g+1]).
static const int DigitFontOffsets[] =
{
 0, 8, 10, 20, 30, 38, 48, 58, 62, 72, 82, 84, 86
};
 
static const float DigitFontScaling = 0.03; // multiplies the glyph coordinates when they are emitted
static const float DigitFontWidth = 0.7 * DigitFontScaling; // The font width is 0.5, but we add spacing
static const int DigitFontMaxLinesPerDigit = 5; // largest glyph is 5 line segments; used to size maxvertexcount
// Line-segment font: each consecutive pair of points is one segment of a glyph.
// Glyphs are 0.5 wide and 1 tall, with the origin at the top-left and y going down
// to -1. DigitFontOffsets gives where each glyph starts in this array.
static const float2 DigitFont[] =
{
 /* 0 */
 float2(0.f, 0.f), float2(0.5f, 0.f), float2(0.5f, 0.f), float2(0.5f, -1.f),
 float2(0.5f, -1.f), float2(0.f, -1.f), float2(0.f, -1.f), float2(0.f, 0.f),
 /*1*/
 float2(0.5f, 0.f), float2(0.5f, -1.f),
 /*2*/
 float2(0.f, 0.f), float2(0.5f, 0.f), float2(0.5f, 0.f), float2(0.5f, -0.5f),
 float2(0.5f, -0.5f), float2(0.f, -0.5f), float2(0.f, -0.5f), float2(0.f, -1.f),
 float2(0.f, -1.f), float2(0.5f, -1.f),
 /*3*/
 float2(0.f, 0.f), float2(0.5f,0.f), float2(0.5f,0.f), float2(0.5f,-0.5f),
 float2(0.5f,-0.5f), float2(0.f,-0.5f), float2(0.5f,-0.5f), float2(0.5f,-1.f),
 float2(0.5f,-1.f), float2(0.f,-1.f),
 /*4*/
 float2(0.f, 0.f), float2(0.f, -0.5f), float2(0.f, -0.5f), float2(0.5f, -0.5f),
 float2(0.5f, -0.5f), float2(0.5f, 0.f), float2(0.5f, -0.5f), float2(0.5f, -1.f),
 /*5*/
 float2(0.f, 0.f), float2(0.f, -0.5f), float2(0.f, -0.5f), float2(0.5f, -0.5f),
 float2(0.5f, -0.5f), float2(0.5f, -1.f), float2(0.f, 0.f), float2(0.5f, 0.f),
 float2(0.f, -1.f), float2(0.5f, -1.f),
 /*6*/
 float2(0.f, 0.f), float2(0.f, -1.f), float2(0.f, -0.5f), float2(0.5f, -0.5f),
 float2(0.5f, -0.5f), float2(0.5f, -1.f), /* avoidable */ float2(0.f, 0.f), float2(0.5f, 0.f),
 float2(0.f, -1.f), float2(0.5f, -1.f),
 /*7*/
 float2(0.5f, 0.f), float2(0.5f, -1.f), float2(0.5f, 0.f), float2(0.f, 0.f),
 /* 8 */
 float2(0.f, 0.f), float2(0.5f, 0.f), float2(0.5f, 0.f), float2(0.5f, -1.f),
 float2(0.5f, -1.f), float2(0.f, -1.f), float2(0.f, -1.f), float2(0.f, 0.f),
 float2(0.f, -0.5f), float2(0.5f, -0.5f),
 /*9*/
 float2(0.f, 0.f), float2(0.5f, 0.f), float2(0.5f, 0.f), float2(0.5f, -1.f),
 float2(0.5f, -0.5f), float2(0.f, -0.5f), float2(0.f, -0.5f), float2(0.f, 0.f),
 float2(0.5f, -1.f), float2(0.f, -1.f),
 /*-*/
 float2(0.5f, -0.5f), float2(0.f, -0.5f),    
 /*.*/
 float2(0.8f, -0.9f), float2(0.9f, -1.f),
};
 
// Matrices used by the debug-draw vertex shader to bring entries into clip space.
cbuffer ShaderDebugDrawData : register(b0)
{
 float4x4 Projection; // applied to entries flagged SHADER_DEBUG_FLAG_3D_VIEWSPACE
 float4x4 ViewProjection; // applied to all other non-2D entries (assumed world space)
};
 
// Vertex format passed from the VS to the GS and from the GS to the PS.
struct vsOut
{
 float4 Pos : SV_Position; // clip-space position
 float3 Color : TexCoord0; // RGB color, output as-is by the PS
};
 
// Entries appended by the instrumented shaders. It is only read here, so it is a
// shader resource view: the CPU side binds it with VS/GSSetShaderResources at slot 0,
// which corresponds to register t0 (u registers are for unordered access views).
StructuredBuffer<ShaderDebugLine> ShaderDebugStructuredBuffer : register(t0);
// Indirect draw arguments written by CS; bound as a UAV at slot 1.
RWBuffer<uint> StructureCount : register(u1);
 
// Emits the line segments of one glyph of the line font.
//  digit  - glyph index into DigitFontOffsets (0-9 digits, 10 = '-', 11 = '.')
//  pos    - position of the glyph origin (its top-left corner)
//  GS_Out - geometry shader output stream the segments are appended to
//  color  - color assigned to every emitted vertex
void DebugDrawDigit(int digit, float4 pos, inout LineStream<vsOut> GS_Out, float3 color)
{
 // Glyph vertices come in pairs, one pair per segment
 for (int i = DigitFontOffsets[digit]; i < DigitFontOffsets[digit+1] - 1; i+=2)
 {
  vsOut p;
  p.Color = color;

  p.Pos = pos + float4(DigitFont[i] * DigitFontScaling, 0, 0);
  GS_Out.Append(p);

  p.Pos = pos + float4(DigitFont[i +1] * DigitFontScaling, 0, 0);
  GS_Out.Append(p);

  // Each segment is its own strip, otherwise consecutive segments would be joined
  GS_Out.RestartStrip();
 }
}
 
// Draws the lowest 'numdigit' decimal digits of a non-negative integer, right to left
// (least significant digit first at 'pos', each following digit one DigitFontWidth
// further left). Leading zeros are drawn when the number has fewer digits.
// Returns the position where the next digit to the left would go.
float4 DebugDrawIntGS(int numberAbs, uint numdigit, float4 pos, inout LineStream<vsOut> GS_Out, float3 color)
{
 while(numdigit > 0)
 {
  DebugDrawDigit(numberAbs % 10u , pos, GS_Out, color);
  numberAbs /= 10u;
  --numdigit;
  pos.x -= DigitFontWidth;
 }

 return pos;
}
 
// Draws one float right-aligned at 'pos': fractional digits (darker), the decimal
// point, the integer digits, then a minus sign for negative values.
//  number      - value to print
//  pos         - position of the right-most digit
//  GS_Out      - geometry shader output stream
//  color       - color of the integer part; the fraction uses half of it
//  totalDigits - digit budget shared between the integer and the fractional part
void DebugDrawFloatHelperGS(float number, float4 pos, inout LineStream<vsOut> GS_Out, float3 color, int totalDigits)
{
 float numberAbs = abs(number);
 uint intPart = (int)numberAbs;

 // Count the integer digits with integer math; log10 can land just below a whole
 // number for exact powers of ten and under-count by one.
 int intDigits = 0;
 for (uint rest = intPart; rest > 0; rest /= 10u)
  ++intDigits;

 // Whatever is left of the budget goes to the fraction. Signed math so a long
 // integer part gives 0 instead of wrapping around. Capped at 7: that is all the
 // precision a 32-bit float has, and it keeps 10^fractDigits exact in float.
 int fractDigits = clamp(totalDigits - intDigits, 0, 7);

 // 10^fractDigits by repeated multiplication (exact, unlike pow)
 float fractScale = 1.0;
 for (int d = 0; d < fractDigits; ++d)
  fractScale *= 10.0;

 // Get the fractional part as an integer of exactly fractDigits digits; the min()
 // stops rounding from producing 10^fractDigits, which would need one more digit.
 uint fractPart = (uint) min(round(frac(numberAbs) * fractScale), fractScale - 1.0);

 // Draw the fractional part
 pos = DebugDrawIntGS((int)fractPart, (uint)fractDigits, pos, GS_Out, color * 0.5 /* make fractional part darker */);

 // Draw the .
 pos.x -= DigitFontWidth * 0.5;
 DebugDrawDigit(11, pos, GS_Out, color);
 pos.x += DigitFontWidth * 0.25;

 // Draw the int part
 if (numberAbs > 0)
 {
  pos = DebugDrawIntGS(intPart, (uint)intDigits, pos, GS_Out, color);
  if (number < 0)
   DebugDrawDigit(10 /* draw a minus sign */, pos, GS_Out, color);
 }
}
 
// Vertex shader for the debug draw. There is no vertex buffer: each pair of
// vertex IDs (2k, 2k+1) reads entry k of the debug buffer and outputs its start
// and end point respectively, transformed according to the entry's space flag.
vsOut VS(uint VertexID : SV_VertexID)
{
 ShaderDebugLine entry = ShaderDebugStructuredBuffer[VertexID/2];

 // Even vertex: start of the line. Odd vertex: end of the line.
 bool isLineStart = (VertexID & 1)==0;
 float3 pos = isLineStart ? entry.posStart : entry.posEnd;

 vsOut output = (vsOut)0;
 output.Color = ShaderDebugColors[entry.color];

 if(entry.flag & SHADER_DEBUG_FLAG_2D)
 {
  // Already in clip space, only xy is used
  output.Pos = float4(pos.xy,0,1);
 }
 else if (entry.flag & SHADER_DEBUG_FLAG_3D_VIEWSPACE)
 {
  output.Pos = mul( float4(pos.xyz,1.0) , Projection);
 }
 else
 {
  // we just assume SHADER_DEBUG_FLAG_3D_WORLDSPACE otherwise
  output.Pos = mul( float4(pos.xyz,1.0) , ViewProjection);
 }

 return output;
}
 
// Single-thread compute shader that turns the copied append-buffer counter into
// the four arguments of an indirect instanced draw.
[numthreads(1,1,1)]
void CS(uint3 id : SV_DispatchThreadID)
{
  uint appendedCount = StructureCount[0];

  StructureCount[0] = appendedCount * 2; // vertex count: two vertices per appended entry
  StructureCount[1] = 1;                 // instance count
  StructureCount[2] = 0;                 // start vertex location
  StructureCount[3] = 0;                 // start instance location
}
 
// Pixel shader: outputs the interpolated vertex color, fully opaque.
float4 PS(vsOut input) : SV_Target0
{
 float4 result;
 result.rgb = input.Color;
 result.a = 1.0f;
 return result;
}
 
// Geometry shader for the debug draw. Receives one line per debug-buffer entry.
// Line entries are passed through; float entries are expanded into a small cross
// at the entry position plus the printed value(s) drawn with the line font.
//
// Worst case we print 3 floats... 4 digits per float plus we need 4 vertices for the . and -, and another four 4 for the cross
[maxvertexcount(3 * (4*(2*DigitFontMaxLinesPerDigit)+4) + 4)]
void GS(line vsOut gin[2], inout LineStream<vsOut> GS_Out, uint PrimitiveID : SV_PrimitiveID)
{
 // We'll get two vertices, one primitive, out of the VS for each element in ShaderDebugStructuredBuffer...
 ShaderDebugLine dbgLine = ShaderDebugStructuredBuffer[PrimitiveID];

 // If we got a line, then just re-emit the line coordinates
 if((dbgLine.flag & SHADER_DEBUG_PRIM_MASKBITS) == SHADER_DEBUG_PRIM_LINE)
 {
  GS_Out.Append(gin[0]);
  GS_Out.Append(gin[1]);
  GS_Out.RestartStrip();

  return;
 }

 float4 pos = gin[0].Pos;

 // Draw cross
 vsOut p;
 p.Color = gin[0].Color;

 p.Pos = pos + float4(DigitFontWidth*0.5,0,0,0);
 GS_Out.Append(p);
 p.Pos = pos + float4(-DigitFontWidth*0.5,0,0,0);
 GS_Out.Append(p);
 GS_Out.RestartStrip();

 p.Pos = pos + float4(0,DigitFontWidth*0.5,0,0);
 GS_Out.Append(p);
 p.Pos = pos + float4(0,-DigitFontWidth*0.5,0,0);
 GS_Out.Append(p);
 GS_Out.RestartStrip();

 // Draw the numbers, as lines
 pos += float4(0,-DigitFontWidth*1.5,0,0);

 // Take the value straight from the buffer entry. gin[1].Pos cannot be used: the VS
 // has already treated posEnd as a position (z dropped for 2D entries, projection
 // applied for 3D ones), which would corrupt the numbers to print.
 float3 number = dbgLine.posEnd;

 if ((dbgLine.flag & SHADER_DEBUG_PRIM_MASKBITS) == SHADER_DEBUG_PRIM_FLOAT1)
 {
  // Less floats drawn means we can afford more precision without exceeding maxvertexcount
  DebugDrawFloatHelperGS(number.x, pos, GS_Out, gin[0].Color, 12);
 }
 else if ((dbgLine.flag & SHADER_DEBUG_PRIM_MASKBITS) == SHADER_DEBUG_PRIM_FLOAT2)
 {
  // Less floats drawn means we can afford more precision without exceeding maxvertexcount, 12/2 = 6 digits
  DebugDrawFloatHelperGS(number.x, pos, GS_Out, gin[0].Color, 6);
  pos.y -= DigitFontWidth * 2;
  DebugDrawFloatHelperGS(number.y, pos, GS_Out, gin[0].Color, 6);
 }
 else //if ((dbgLine.flag & SHADER_DEBUG_PRIM_MASKBITS) == SHADER_DEBUG_PRIM_FLOAT3)
 {
  // 3*4 we draw 12 digits here...
  DebugDrawFloatHelperGS(number.x, pos, GS_Out, gin[0].Color, 4);
  pos.y -= DigitFontWidth * 2;
  DebugDrawFloatHelperGS(number.y, pos, GS_Out, gin[0].Color, 4);
  pos.y -= DigitFontWidth * 2;
  DebugDrawFloatHelperGS(number.z, pos, GS_Out, gin[0].Color, 4);
 }
}