Search this blog

29 July, 2013

Tiny HDR writer

It's really annoying to see the lack of simple (i.e. one function) writers of HDR image formats. I wrote my own Radiance HDR (RGB-exponent) and floating point TIFF writers. (forgive the sin of using std::string as a vector of bytes, was done as certain file write function in the framework used strings). 

They are rather ignorant as you will see from the source, rather slow too... But making them clever is left as an exercise to the reader, let's say this is really nothing more than documentation on the formats themselves, or better, on the minimal subset of the formats you need to know in order to write out HDR data.

Also you might want, as the TIFF routine writes raw float data, to convert it into an "inline" operation (i.e. BeginTiff, PushFloat, EndTiff kind of interface), which is simple enough especially if you move the IFD before the image data... Also, it would be much easier if it wrote the endianness in the header based on your current platform file output order, making it easier than byte-by-byte writing as it is now.

Update: Aras Pranckevičius tweeted his EXR writer, so I was wrong, there was at least one simple HDR writer out there already. Also, EXR is more widespread than floating point TIF, and even easier... Partially related, Jon Olick has neat single-file JPEG and MPEG writers, handy (and I'm sure everybody knows about stb_image and image write, but just in case...)!


// http://paulbourke.net/dataformats/tiff/ and http://partners.adobe.com/public/developer/en/tiff/TIFF6.pdf
// Not all programs support floating-point TIFFs, this was tested reading it back using Picturenaut and HDRShop
// Encodes an RGB floating-point image as an uncompressed, single-strip, big-endian ("MM")
// 32-bit-float TIFF and returns the bytes of the file.
//
//   w, h            image size in pixels
//   RGBdata         w*h pixels, row by row, floatsPerPixel floats each; only the first three
//                   floats of every pixel (R, G, B) are written
//   floatsPerPixel  distance between two pixels in RGBdata, in floats (must be >= 3)
//
// File layout: 8-byte header | pixel data | IFD (12 entries) | BitsPerSample[3] | SampleFormat[3]
static std::string EncodeFloatTIFF(unsigned int w, unsigned int h, const float* RGBdata, unsigned int floatsPerPixel = 4)
{
    assert(floatsPerPixel >= 3); // we write only three floats (RGB) but support larger strides

    const uint32_t NUM_IFD = 12;
    const uint32_t header_bytes = 8;
    const uint32_t ifd_bytes = 2 + NUM_IFD * 12 + 4; // entry count + entries + next-IFD offset
    const uint32_t tail_bytes = 2 * 3 * 2;           // BitsPerSample[3] + SampleFormat[3], all SHORTs

    // Every offset in a classic TIFF is 32 bits wide, so the whole file has to stay below 4 GiB.
    const uint64_t image_size_64 = static_cast<uint64_t>(w) * h * 3 * sizeof(float);
    assert(image_size_64 + header_bytes + ifd_bytes + tail_bytes <= 0xffffffffull);
    const uint32_t image_size_bytes = static_cast<uint32_t>(image_size_64);

    const uint32_t IFD_offset = header_bytes + image_size_bytes; // the IFD follows the image
    const uint32_t BPS_offset = IFD_offset + ifd_bytes;          // out-of-line BitsPerSample values
    const uint32_t SF_offset = BPS_offset + 3 * 2;               // out-of-line SampleFormat values

    std::string outData;
    outData.reserve(static_cast<size_t>(image_size_bytes) + header_bytes + ifd_bytes + tail_bytes);

    // Big-endian writers
    auto put8 = [&outData](uint32_t v) { outData.push_back(static_cast<char>(v & 0xff)); };
    auto put16 = [&](uint32_t v) { put8(v >> 8); put8(v); };
    auto put32 = [&](uint32_t v) { put8(v >> 24); put8(v >> 16); put8(v >> 8); put8(v); };

    // IFD entry writers: tag (2 bytes), field type (2), value count (4), value or offset (4)
    auto putShortTag = [&](uint32_t tag, uint32_t value)
    {
        put16(tag); put16(3); put32(1);
        put16(value); put16(0); // a single SHORT is left-justified, the rest is padding
    };
    auto putLongTag = [&](uint32_t tag, uint32_t value)
    {
        put16(tag); put16(4); put32(1);
        put32(value);
    };
    auto putShortTripletTag = [&](uint32_t tag, uint32_t offset)
    {
        put16(tag); put16(3); put32(3);
        put32(offset); // three SHORTs do not fit in four bytes, so this is an offset to them
    };
    // Sizes may be SHORT or LONG: keep SHORT where it fits, use LONG instead of truncating
    auto putSizeTag = [&](uint32_t tag, uint32_t value)
    {
        if (value <= 0xffff)
            putShortTag(tag, value);
        else
            putLongTag(tag, value);
    };

    // Header
    put8(0x4d); put8(0x4d); // "MM": big endian TODO - convert to little to make it easier on x86
    put16(42);              // TIFF version ID
    put32(IFD_offset);

    // Image data: one strip, samples interleaved as RGBRGB...
    const float* pixel = RGBdata;
    for (unsigned int y = 0; y < h; y++)
    {
        for (unsigned int x = 0; x < w; x++)
        {
            for (unsigned int c = 0; c < 3; c++)
                put32(*reinterpret_cast<const uint32_t*>(pixel + c)); // raw IEEE bits of the float
            pixel += floatsPerPixel; // skips whatever follows R, G, B in the input
        }
    }

    // IFD. The entries have to be written in ascending tag order.
    assert(outData.size() == IFD_offset);
    put16(NUM_IFD);                          // number of entries
    putSizeTag(0x100, w);                    // ImageWidth
    putSizeTag(0x101, h);                    // ImageLength
    putShortTripletTag(0x102, BPS_offset);   // BitsPerSample
    putShortTag(0x103, 1);                   // Compression: none
    putShortTag(0x106, 2);                   // PhotometricInterpretation: RGB
    putLongTag(0x111, header_bytes);         // StripOffsets: image starts right after the header
    putShortTag(0x112, 1);                   // Orientation
    putShortTag(0x115, 3);                   // SamplesPerPixel: three samples (RGB)
    putSizeTag(0x116, h);                    // RowsPerStrip: the whole image is one strip
    putLongTag(0x117, image_size_bytes);     // StripByteCounts: total size
    putShortTag(0x11c, 1);                   // PlanarConfiguration: single image plane
    putShortTripletTag(0x153, SF_offset);    // SampleFormat
    put32(0);                                // IFD END (no further IFD)

    // bits per sample data
    assert(outData.size() == BPS_offset);
    for (int c = 0; c < 3; c++)
        put16(8 * sizeof(float));

    // sample format data (1 = uint, 2 = sint, 3 = float)
    assert(outData.size() == SF_offset);
    for (int c = 0; c < 3; c++)
        put16(3);

    return outData;
}

// Encodes an RGB floating-point image as a Radiance HDR (RGBE) file and returns its bytes.
//
//   w, h            image size in pixels
//   RGBdata         w*h pixels, row by row, floatsPerPixel floats each; only the first three
//                   floats of every pixel (R, G, B) are used
//   floatsPerPixel  distance between two pixels in RGBdata, in floats (must be >= 3)
//
// Scanlines use the channel-separated ("new RLE") layout, but everything is emitted as
// literal, non-run chunks, so nothing is actually compressed. That layout is only defined for
// widths from 8 to 32767; any other width is written as flat RGBE pixels instead.
static std::string EncodeRadianceHDR(unsigned int w, unsigned int h, const float* RGBdata, unsigned int floatsPerPixel = 4)
{
    assert(floatsPerPixel >= 3); // we write only three floats (RGB) but support larger strides

    // Key-Value pairs after RADIANCE are optional
    //const char header[] = "#?RADIANCE\nEXPOSURE=1\nGAMMA=2.2\nFORMAT=32-bit_rle_rgbe\n\n";
    const char header[] = "#?RADIANCE\nFORMAT=32-bit_rle_rgbe\n\n";

    const bool separateChannels = (w >= 8) && (w <= 0x7fff);
    const size_t maxChunk = 127; // the last bit in a count byte, if set, would indicate a RLE run

    std::string outData;
    const size_t chunksPerChannel = (static_cast<size_t>(w) + maxChunk - 1) / maxChunk;
    outData.reserve(sizeof(header) + 32 +
                    static_cast<size_t>(h) * (4 * static_cast<size_t>(w) + 4 + 4 * chunksPerChannel));

    outData.append(header, sizeof(header) - 1);
    outData.append("-Y ", 3);
    outData += std::to_string(h);
    outData.append(" +X ", 4);
    outData += std::to_string(w);
    outData.push_back('\n');

    // Float -> byte with saturation; negative values and NaN become 0
    auto toByte = [](double v) -> unsigned char
    {
        if (!(v > 0.0))
            return 0;
        if (v >= 255.0)
            return 255;
        return static_cast<unsigned char>(v);
    };

    std::vector<unsigned char> scanline[4]; // one plane per RGBE channel
    if (separateChannels)
    {
        for (auto& plane : scanline)
            plane.resize(w);
    }

    const float* pixel = RGBdata;
    for (unsigned int y = 0; y < h; y++)
    {
        if (separateChannels)
        {
            // Scanline header: 2, 2, then the width as a big-endian 16 bit value
            outData.push_back(2);
            outData.push_back(2);
            outData.push_back(static_cast<char>((w >> 8) & 0xff));
            outData.push_back(static_cast<char>(w & 0xff));
        }

        for (unsigned int x = 0; x < w; x++)
        {
            const float r = pixel[0], g = pixel[1], b = pixel[2];
            //r /= 179.0; g /= 179.0; b /= 179.0;

            double maxV = r;
            if (maxV < g) maxV = g;
            if (maxV < b) maxV = b;

            unsigned char encodedPixel[4] = { 0, 0, 0, 0 };
            if (maxV >= std::numeric_limits<double>::epsilon())
            {
                // Shared exponent from the largest channel; mantissas are scaled to 0..255
                int e;
                const double scale = std::frexp(maxV, &e) * 256.0 / maxV;
                encodedPixel[0] = toByte(scale * r);
                encodedPixel[1] = toByte(scale * g);
                encodedPixel[2] = toByte(scale * b);
                encodedPixel[3] = static_cast<unsigned char>(e + 128);
            }

            if (separateChannels)
            {
                for (int c = 0; c < 4; c++)
                    scanline[c][x] = encodedPixel[c];
            }
            else
            {
                outData.append(reinterpret_cast<const char*>(encodedPixel), 4);
            }

            pixel += floatsPerPixel;
        }

        if (!separateChannels)
            continue;

        // For simplicity, write all as it was not RLE: literal chunks of at most 127 bytes
        for (const auto& plane : scanline)
        {
            size_t done = 0;
            while (done < plane.size())
            {
                const size_t remaining = plane.size() - done;
                const size_t toWrite = remaining > maxChunk ? maxChunk : remaining;

                outData.push_back(static_cast<char>(toWrite)); // length of the "non run" data
                outData.append(reinterpret_cast<const char*>(plane.data() + done), toWrite);
                done += toWrite;
            }
        }
    }

    return outData;
}

14 July, 2013

DX11: GPU "printf"

So, first a little "announcement": I'm crafting a small DX11 rendering framework in my spare time. I want to have it opensourced, and it's based on MJP's excellent SampleFramework11.
The goals are to provide an environment roughly as fast to iterate upon as FXComposer was (I consider it dead now...) but for programmers, without being a "shader editor".
If you're interested in collaborating, send me an email at c0de517e (it's a gmail account) with a brief introduction, there is an interesting list of things to do.

That said, this is a little bit of functionality Maurizio Cerrato and I have been working on for a couple of days, a "printf"-like function for pixel (and compute) shaders. It all started when, chatting with Daniel Sewell (a brilliant guy, was my rendering lead on Fight Night), he made me notice that he found, working on CS, that a neat way to debug them was to display all kinds of interesting debug visualizations by having geometry shaders "decode" buffers and emit lines.

if(IsDebuggedPixel(input.PositionSS.xy)) DebugDrawFloat(float2(ssao, bloom.x), clipPos);
The astute readers will at this point have already all figured it out. PS and CS support append buffers, so a "printf" has only to append some data to a buffer that later you can convert to lines in a geometry shader.

You could emit such data per each PS invocation and later sift through it and display what you needed in a meaningful way, but that will be quite slow (and at that point you might want to consider just packing everything into some MRT outputs). The idea behind appendbuffers is to do the work only for a handful of invocations (e.g. screen positions, if current sv_position equals the pixel to "debug" then GPU printf...).

In order to keep everything snappy we also minimize the structure size we use in the append buffer, you can't really printf strings, the debugger so far supports only one to three floats w/color and position or lines. Lines are where we started really, our struct contains two end-points, a color (index) and a flag which distinguishes lines from float printf. Floats just reinterpret one of the endpoints as the data to print.

This append buffer structure then gets fed to a VS/GS that is invoked twice as many times as the append buffer count (via draw indirect, you need to multiply the count by two in a small CS, remember, you can't emit the start/end vertices as two separate append calls because the order of these is not deterministic, the vertices will end up all mixed in the buffer!), and the GS emits extra lines if we're printing floats to display a small line-based font.

If you're thinking that is lame, well it is, there are certain limitations in the number of primitives the GS can emit that effectively limit the number of digits you can display, and you have to be careful about that, I "optimized" the code to display the most digits possible which unfortunately gives you very low-precision 3-float printf and higher precision 2-float and 1-float (you could though call three times the 1-float version... as there the ordering of the three call doesn't matter).

Keeping the same number of printed digits, the point has to float...
Why not use a bitmap font instead? Glad you asked. Laziness, partially justified by the fact that I didn't want to have two different append buffers, one for lines and one for fonts, as the append buffers are a scarce resource on DX11. But it's a very lame justification, because there are plenty of workarounds left for the reader, you could filter the append buffer in two drawcalls in a compute shader, or even draw lines as quads, which would probably be better anyways!

Anyhow, together with shader hot-reloading (which everybody has, right), this is quite a handy trick. Bonus: on a similar note, have a look at this shadertoy snippet by my coworker Paul Malin... brilliant guy!

Some code, without doubt full of bugs:

Snippet from the CPU/C++ side, drawing the debug lines...
void ShaderDebugDraw(ID3D11DeviceContextcontextconst Float4x4viewProjectionMatrixconst Float4x4projMatrix )
{
    SampleFramework11::PIXEvent market(L"ShaderDebug Draw");
 
    context->CopyStructureCount(AppendBufferCountCopy, 0, AppendBuffer.UAView);
 
    // We need a compute shader to write BufferCountUAV, as we need to multiply CopyStructureCount by two 
    ID3D11ShaderResourceView* srViews[] = { AppendBuffer.SRView };
    ID3D11UnorderedAccessView* uaViews[] = { AppendBufferCountCopyUAV };
    UINT uavsCount[] = { 0 };
    context->CSSetUnorderedAccessViews(1, 1, uaViews, uavsCount);
    context->CSSetShader(DebugDrawShader.AcquireCS(), NULL, 0);
    context->Dispatch(1,1,1);
    context->CSSetShader(NULLNULL, 0);
    uaViews[0] = NULL;
    context->CSSetUnorderedAccessViews(1, 1, uaViews, uavsCount);
 
    // Set all IA stage inputs to NULL, since we're not using it at all.
    void* nulls[D3D11_IA_VERTEX_INPUT_RESOURCE_SLOT_COUNT] = { NULL };
 
    context->IASetVertexBuffers(0, D3D11_IA_VERTEX_INPUT_RESOURCE_SLOT_COUNT, (ID3D11Buffer**)nulls, (UINT*)nulls, (UINT*)nulls);
    context->IASetInputLayout(NULL);
    context->IASetIndexBuffer(NULLDXGI_FORMAT_UNKNOWN, 0);
 
    // Draw debug lines
    srViews[0] =  AppendBuffer.SRView;
    context->VSSetShaderResources(0, 1, srViews);
    context->GSSetShaderResources(0, 1, srViews);
    context->GSSetShader(DebugDrawShader.AcquireGS(), NULL, 0);  
    context->VSSetShader(DebugDrawShader.AcquireVS(), NULL, 0);
    context->PSSetShader(DebugDrawShader.AcquirePS(), NULL, 0);
 
    shaderDebugDrawDataVS.Data.ViewProjection = viewProjectionMatrix;
    shaderDebugDrawDataVS.Data.Projection = projMatrix;
    shaderDebugDrawDataVS.ApplyChanges(context);
    shaderDebugDrawDataVS.SetVS(context, 0);
 
    context->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_LINELIST);
    context->DrawInstancedIndirect(AppendBufferCountCopy, 0);
[...]

This is roughly how the shader library looks for emitting debug lines/debug numbers from pixel shaders
// One element of the debug append buffer: either a line or a "print these floats" request.
struct ShaderDebugLine
{
 float3 posStart; // start point of the line; DebugDrawFloat stores the print position here
 float3 posEnd; // end point of the line; DebugDrawFloat stores the value(s) to print here
 uint color; // color index (the drawing VS uses it to index ShaderDebugColors)
 uint flag; // SHADER_DEBUG_PRIM_* primitive kind combined with a SHADER_DEBUG_FLAG_* space flag
};
 
// Constants that select which pixel(s) emit debug output (read by IsDebuggedPixel).
cbuffer ShaderDebugData : register(b13)
{
 float2 debugPixelCoords; // compared against svPos + 0.5 when debugType == 1
 float2 oneOverDisplaySize; // NOTE(review): presumably 1 / render target size in pixels - not read by the code shown here
 int debugType; // 1 = only the pixel at debugPixelCoords, 2 = a grid with a 100 pixel period, anything else = off
};
// Queues a three-component value for printing: appends one ShaderDebugLine flagged as
// SHADER_DEBUG_PRIM_FLOAT3 whose start point is the print position and whose end point
// carries the number itself.
void DebugDrawFloat(float3 number,  float3 pos, int color = 0, uint spaceFlag = SHADER_DEBUG_FLAG_2D)
{
 ShaderDebugLine entry;
 entry.flag = SHADER_DEBUG_PRIM_FLOAT3|spaceFlag;
 entry.color = color;
 entry.posStart = pos;
 entry.posEnd = number; // the payload travels in the "end point" slot
 ShaderDebugAppendBuffer.Append(entry);
}
// Scales svPos by oneOverDisplaySize, then remaps [0,1] to x in [-1,1] and y in [1,-1] (y flipped).
float2 SVPosToClipspace(float2 svPos, float2 oneOverDisplaySize)
{
 const float2 normalized = svPos * oneOverDisplaySize;
 return normalized * float2(2,-2) + float2(-1,1);
}
 
// Returns true when the invocation at svPos should emit debug output, according to debugType:
// 1 selects the single pixel at debugPixelCoords, 2 selects a grid with a 100 pixel period.
bool IsDebuggedPixel(float2 svPos)
{
 // This is a bit tricky because it depends on the MSAA pattern

 if(debugType == 1)
 {
  // Sum of the per-axis distances between the debugged pixel and this one
  const float2 offset = abs(debugPixelCoords - svPos + float2(0.5,0.5));
  return dot(offset, 1.0.xx) <= 0.01f;
 }

 if(debugType == 2)
 {
  // Position inside the current 100x100 cell
  const float2 cellPos = abs(svPos % float2(100,100));
  return dot(cellPos, 1.0.xx) <= 1.01f;
 }

 return false;
}

And finally, the VS/GS/CS shaders needed to draw the debug buffer emitted from the various PS executions:
// Glyph lookup for DigitFont: glyph g occupies the vertices DigitFontOffsets[g] up to (but not
// including) DigitFontOffsets[g+1]. Glyphs 0-9 are the decimal digits, 10 is '-' and 11 is '.'.
static const int DigitFontOffsets[] =
{
 0, 8, 10, 20, 30, 38, 48, 58, 62, 72, 82, 84, 86
};
 
static const float DigitFontScaling = 0.03; // multiplies the glyph coordinates before they are added to the draw position
static const float DigitFontWidth = 0.7 * DigitFontScaling; // The font width is 0.5, but we add spacing
static const int DigitFontMaxLinesPerDigit = 5; // largest glyphs have 10 vertices = 5 lines; used to size the GS maxvertexcount
// Line-segment font. Every consecutive pair of points is one line segment; each glyph lives in
// a box with x in [0, 0.5] and y in [0, -1] (the '.' glyph sits slightly to the right of it).
// DigitFontOffsets gives the first vertex of every glyph.
static const float2 DigitFont[] =
{
 /* 0 */
 float2(0.f, 0.f), float2(0.5f, 0.f), float2(0.5f, 0.f), float2(0.5f, -1.f),
 float2(0.5f, -1.f), float2(0.f, -1.f), float2(0.f, -1.f), float2(0.f, 0.f),
 /*1*/
 float2(0.5f, 0.f), float2(0.5f, -1.f),
 /*2*/
 float2(0.f, 0.f), float2(0.5f, 0.f), float2(0.5f, 0.f), float2(0.5f, -0.5f),
 float2(0.5f, -0.5f), float2(0.f, -0.5f), float2(0.f, -0.5f), float2(0.f, -1.f),
 float2(0.f, -1.f), float2(0.5f, -1.f),
 /*3*/
 float2(0.f, 0.f), float2(0.5f,0.f), float2(0.5f,0.f), float2(0.5f,-0.5f),
 float2(0.5f,-0.5f), float2(0.f,-0.5f), float2(0.5f,-0.5f), float2(0.5f,-1.f),
 float2(0.5f,-1.f), float2(0.f,-1.f),
 /*4*/
 float2(0.f, 0.f), float2(0.f, -0.5f), float2(0.f, -0.5f), float2(0.5f, -0.5f),
 float2(0.5f, -0.5f), float2(0.5f, 0.f), float2(0.5f, -0.5f), float2(0.5f, -1.f),
 /*5*/
 float2(0.f, 0.f), float2(0.f, -0.5f), float2(0.f, -0.5f), float2(0.5f, -0.5f),
 float2(0.5f, -0.5f), float2(0.5f, -1.f), float2(0.f, 0.f), float2(0.5f, 0.f),
 float2(0.f, -1.f), float2(0.5f, -1.f),
 /*6*/
 float2(0.f, 0.f), float2(0.f, -1.f), float2(0.f, -0.5f), float2(0.5f, -0.5f),
 float2(0.5f, -0.5f), float2(0.5f, -1.f), /* avoidable */ float2(0.f, 0.f), float2(0.5f, 0.f),
 float2(0.f, -1.f), float2(0.5f, -1.f),
 /*7*/
 float2(0.5f, 0.f), float2(0.5f, -1.f), float2(0.5f, 0.f), float2(0.f, 0.f),
 /* 8 */
 float2(0.f, 0.f), float2(0.5f, 0.f), float2(0.5f, 0.f), float2(0.5f, -1.f),
 float2(0.5f, -1.f), float2(0.f, -1.f), float2(0.f, -1.f), float2(0.f, 0.f),
 float2(0.f, -0.5f), float2(0.5f, -0.5f),
 /*9*/
 float2(0.f, 0.f), float2(0.5f, 0.f), float2(0.5f, 0.f), float2(0.5f, -1.f),
 float2(0.5f, -0.5f), float2(0.f, -0.5f), float2(0.f, -0.5f), float2(0.f, 0.f),
 float2(0.5f, -1.f), float2(0.f, -1.f),
 /*-*/
 float2(0.5f, -0.5f), float2(0.f, -0.5f),    
 /*.*/
 float2(0.8f, -0.9f), float2(0.9f, -1.f),
};
 
// Matrices used by the debug-draw vertex shader to bring buffered positions to clip space.
cbuffer ShaderDebugDrawData : register(b0)
{
 float4x4 Projection; // applied to positions flagged SHADER_DEBUG_FLAG_3D_VIEWSPACE
 float4x4 ViewProjection; // applied to every other non-2D position (assumed to be world space)
};
 
// Vertex format shared by the debug-draw VS output, the GS input/output and the PS input.
struct vsOut
{
 float4 Pos : SV_Position; // position written by the VS / GS
 float3 Color : TexCoord0; // line color, returned as-is by the PS
};
 
// The append buffer filled by the pixel/compute shaders, read back here element by element.
// The element type has to be spelled out: the shaders below access .posStart/.posEnd/.color/.flag.
// NOTE(review): a read-only StructuredBuffer is a shader resource view and the CPU side binds it
// with VSSetShaderResources/GSSetShaderResources at slot 0, which would be register(t0) -
// confirm the register below.
StructuredBuffer<ShaderDebugLine> ShaderDebugStructuredBuffer : register(u0);
// Indirect draw arguments rewritten by the CS (UAV slot 1).
RWBuffer<uint> StructureCount : register(u1);
 
// Emits the line segments of one glyph of DigitFont.
//   digit   glyph index: 0-9 for the digits, 10 for '-', 11 for '.'
//   pos     position of the glyph origin (its top-left corner)
//   GS_Out  geometry shader output stream; two vertices are appended per segment
//   color   color given to every emitted vertex
void DebugDrawDigit(int digit, float4 pos, inout LineStream<vsOut> GS_Out, float3 color)
{
 vsOut p;
 p.Color = color;

 // The glyph's vertices are stored as consecutive (start, end) pairs
 for (int i = DigitFontOffsets[digit]; i < DigitFontOffsets[digit+1] - 1; i+=2)
 {
  p.Pos = pos + float4(DigitFont[i] * DigitFontScaling, 0, 0);
  GS_Out.Append(p);

  p.Pos = pos + float4(DigitFont[i +1] * DigitFontScaling, 0, 0);
  GS_Out.Append(p);

  // Each segment is an independent line, not part of a strip
  GS_Out.RestartStrip();
 }
}
 
// Draws the numdigit least significant decimal digits of numberAbs, right to left: the lowest
// digit is drawn at pos and every following digit one DigitFontWidth further to the left
// (leading zeros are drawn when the number has fewer digits).
// Returns the position where the next digit to the left would go.
float4 DebugDrawIntGS(int numberAbs, uint numdigit, float4 pos, inout LineStream<vsOut> GS_Out, float3 color)
{
 while(numdigit > 0)
 {
  DebugDrawDigit(numberAbs % 10u , pos, GS_Out, color);
  numberAbs /= 10u;
  --numdigit;
  pos.x -= DigitFontWidth;
 }

 return pos;
}
 
// Draws number right to left starting at pos: fractional digits, the point, the integer
// digits and a minus sign when negative.
//   totalDigits  digit budget shared by the integer and fractional parts; the fractional part
//                gets whatever the integer part leaves (the point "floats")
// The fraction is rounded to the printed precision but never carries into the integer part
// (0.99996 with four fractional digits prints as .9999).
void DebugDrawFloatHelperGS(float number, float4 pos, inout LineStream<vsOut> GS_Out, float3 color, int totalDigits)
{
 float numberAbs = abs(number);
 uint intPart = (uint)numberAbs;

 // Count the integer digits with integer math: (uint)log10(x) + 1 comes out one short
 // whenever log10 lands just below a whole number
 uint intDigits = 0;
 for (uint rest = intPart; rest > 0; rest /= 10u)
  ++intDigits;

 // Signed subtraction, so a long integer part clamps to zero instead of wrapping around
 uint fractDigits = (uint)max(0, totalDigits - (int)intDigits);

 // A float holds about 7 significant decimal digits: compute at most 7 real fractional digits
 // (which also keeps the scaled value far inside uint range) and pad the rest with zeros
 uint realDigits = min(fractDigits, 7u);
 uint fractScale = 1;
 for (uint d = 0; d < realDigits; ++d)
  fractScale *= 10u;

 // Get the fractional part, scaled so that it fills exactly realDigits digits
 uint fractPart = min((uint)round(frac(numberAbs) * (float)fractScale), fractScale - 1u);

 // Draw the fractional part: zero padding first (rightmost), then the real digits
 float3 fractColor = color * 0.5; /* make fractional part darker */
 pos = DebugDrawIntGS(0, fractDigits - realDigits, pos, GS_Out, fractColor);
 pos = DebugDrawIntGS(fractPart, realDigits, pos, GS_Out, fractColor);

 // Draw the .
 pos.x -= DigitFontWidth * 0.5;
 DebugDrawDigit(11, pos, GS_Out, color);
 pos.x += DigitFontWidth * 0.25;

 // Draw the int part
 if (numberAbs > 0)
 {
  pos = DebugDrawIntGS(intPart, intDigits, pos, GS_Out, color);
  if (number < 0)
   DebugDrawDigit(10 /* draw a minus sign */, pos, GS_Out, color);
 }
}
 
// Vertex shader of the debug draw: every buffer element is visited by two consecutive
// vertices, the even one outputs its start point and the odd one its end point.
vsOut VS(uint VertexID : SV_VertexID)
{
 const uint element = VertexID/2;
 const bool isLineStart = (VertexID & 1) == 0;

 const uint colorIndex = ShaderDebugStructuredBuffer[element].color;
 const uint spaceFlags = ShaderDebugStructuredBuffer[element].flag;

 float3 pos;
 if(isLineStart)
  pos = ShaderDebugStructuredBuffer[element].posStart;
 else // we're processing the end of the line
  pos = ShaderDebugStructuredBuffer[element].posEnd;

 vsOut output = (vsOut)0;
 output.Color = ShaderDebugColors[colorIndex];

 if(spaceFlags & SHADER_DEBUG_FLAG_2D)
 {
  // Used as it is, only x and y are kept
  output.Pos = float4(pos.xy,0,1);
 }
 else if (spaceFlags & SHADER_DEBUG_FLAG_3D_VIEWSPACE)
 {
  output.Pos = mul( float4(pos.xyz,1.0) , Projection);
 }
 else
 {
  // we just assume SHADER_DEBUG_FLAG_3D_WORLDSPACE otherwise
  output.Pos = mul( float4(pos.xyz,1.0) , ViewProjection);
 }

 return output;
}
 
// Single-thread pass that rewrites the four values in StructureCount: the first one is
// doubled (two vertices per buffer element), the others are set to 1, 0 and 0.
[numthreads(1,1,1)]
void CS(uint3 id : SV_DispatchThreadID)
{
  const uint elementCount = StructureCount[0];

  StructureCount[0] = elementCount * 2;
  StructureCount[1] = 1;
  StructureCount[2] = 0;
  StructureCount[3] = 0;
}
 
// Outputs the interpolated line color with full alpha.
float4 PS(vsOut input) : SV_Target0
{
 float4 result;
 result.rgb = input.Color;
 result.a = 1.0f;
 return result;
}
 
// Geometry shader of the debug draw. Line elements are passed through unchanged; float
// elements become a small cross at the print position plus the digits of the value(s).
// Worst case we print 3 floats... 4 digits per float plus we need 4 vertices for the . and -, and another four 4 for the cross
[maxvertexcount(3 * (4*(2*DigitFontMaxLinesPerDigit)+4) + 4)]
void GS(line vsOut gin[2], inout LineStream<vsOut> GS_Out, uint PrimitiveID : SV_PrimitiveID)
{
 // We'll get two vertices, one primitive, out of the VS for each element in ShaderDebugStructuredBuffer...
 // TODO: we could avoid reading ShaderDebugStructuredBuffer if we passed the number flag along from the VS
 ShaderDebugLine dbgLine = ShaderDebugStructuredBuffer[PrimitiveID];
 const uint primType = dbgLine.flag & SHADER_DEBUG_PRIM_MASKBITS;

 // If we got a line, then just re-emit the line coordinates
 if(primType == SHADER_DEBUG_PRIM_LINE)
 {
  GS_Out.Append(gin[0]);
  GS_Out.Append(gin[1]);
  GS_Out.RestartStrip();

  return;
 }

 float4 pos = gin[0].Pos;

 // Draw cross
 vsOut p;
 p.Color = gin[0].Color;

 p.Pos = pos + float4(DigitFontWidth*0.5,0,0,0);
 GS_Out.Append(p);
 p.Pos = pos + float4(-DigitFontWidth*0.5,0,0,0);
 GS_Out.Append(p);
 GS_Out.RestartStrip();

 p.Pos = pos + float4(0,DigitFontWidth*0.5,0,0);
 GS_Out.Append(p);
 p.Pos = pos + float4(0,-DigitFontWidth*0.5,0,0);
 GS_Out.Append(p);
 GS_Out.RestartStrip();

 // Draw the numbers, as lines
 pos += float4(0,-DigitFontWidth*1.5,0,0);

 // Take the values straight from the buffer element. gin[1].Pos went through the VS position
 // path, which drops z for 2D elements and multiplies by a projection matrix for 3D ones.
 float3 number = dbgLine.posEnd;

 if (primType == SHADER_DEBUG_PRIM_FLOAT1)
 {
  // Less floats drawn means we can afford more precision without exceeding maxvertexcount
  DebugDrawFloatHelperGS(number.x, pos, GS_Out, gin[0].Color, 12);
 }
 else if (primType == SHADER_DEBUG_PRIM_FLOAT2)
 {
  // Less floats drawn means we can afford more precision without exceeding maxvertexcount, 12/2 = 6 digits
  DebugDrawFloatHelperGS(number.x, pos, GS_Out, gin[0].Color, 6);
  pos.y -= DigitFontWidth * 2;
  DebugDrawFloatHelperGS(number.y, pos, GS_Out, gin[0].Color, 6);
 }
 else //if (primType == SHADER_DEBUG_PRIM_FLOAT3)
 {
  // 3*4 we draw 12 digits here...
  DebugDrawFloatHelperGS(number.x, pos, GS_Out, gin[0].Color, 4);
  pos.y -= DigitFontWidth * 2;
  DebugDrawFloatHelperGS(number.y, pos, GS_Out, gin[0].Color, 4);
  pos.y -= DigitFontWidth * 2;
  DebugDrawFloatHelperGS(number.z, pos, GS_Out, gin[0].Color, 4);
 }
}

13 May, 2013

I'll create a new trend...

Typography (Helvetica) on Pictures about Technical stuff. My next presentation will be all like this. Oh, shit, I should have added cats too.

In all seriousness though. This "poster" tells the truth, I didn't realize, until recently, how important this lesson is. And how important is, when you follow it, to pick the right thing to specialize upon (as you won't be able to change it much later on...).

I've seen it in the CouchDB guide, where the citation is attributed to Joe Stump. Premature generalization is the root of all evil...

Scaling is Specialization.

Integrating C++11 in your diet

Even for a guy like me who despises C++ and is happy to escape from it as often as possible, the reality of daily work still involves mostly C++ programming.

Being "good" at C++ is mostly a matter of having a good diet. Of course you try to write "sane" C++, staying C as much as possible, using a "safe" subset of the language (1 2 3 etc...), using static code checkers (vs2012 analysis at least, even if I've found it to be quite lax) and so on, these things have been written over an over. The bottom line is, you find your subset of things that are usable and of rules that never should be broken.

Now, parts of the new C++11 standard are coming into mainstream compilers (read, Visual Studio 2012) and so I had to update my "diet" to incorporate a few new, useful features (mostly C++ trying to look like C#, which ain't bad).

This is my small list of things you should consider to start using (at least on PC, for tools etc...).
  • Use today:
    • Auto - Variable type inference. Really, makes a big difference in readability and it's essential for things like stl iterators and so on. It's "deeper" than just shorthand notation as well, as it infers type it always avoids nasty implicit conversions and forces you to write everything explicitly. Also, it propagates changes, so if you change a type (e.g. constness) of a function parameter, you don't have to waste time on all the local types. It also enables new things with templates (but who cares) and lambdas. Note: VaX now supports auto and it shows the inferred type!
    • Lambdas - Simple, much better than function pointers, and also support closures which are the real deal, with a decent, explicit syntax. As C++ doesn't have garbage collection they have restrictions lambdas in other languages don't face, that's to say, you have to think of how you capture things and what are their lifetimes, but it's something we're used to by now (and made "easier" by the explicit capturing syntax, which forces you to think about what you're doing). Still you might want to fallback to regular functor objects when you need to make more explicit what you're doing in the "capturing" constructor/destructor but that's fine. Be sure to know what they really are (typeless objects on the stack... actually, their type can be captured locally by "auto", it avoids a conversion to function<>). Note that "auto" also works on lamba parameters, which is really great, and that you can pass "captureless" lambdas as function pointers too.
    • Type traits are fundamental, now you can static_assert away all your hacks (e.g. memset to zero a type? assert is POD...). True, we had them in Boost already, so this could be seen as "minor", but not many companies in my line of work would like to depend on Boost (even if depending only on traits is reasonable), re-implementing them is not trivial (unlike say, static_assert) and so this being part of the official standard is great. Also, the availability of Boost's ones lowers the preoccupation about compatibility.
    • Range based for - int array[5] = { 1, 2, 3, 4, 5 }; for (int& x : array)... Small, but saves some typing and every other language does have it...
    • Override and Final for virtual functions. Maybe in ten years we'll even have "out" for non-const reference/pointer parameters...
  • Would use today, but not yet widespread (that to me, means non implemented by VS2012...):
    • Non-static member initializer - The ability to initialize member variables at the point of declaration, instead of having to add code to your constructors
    • Constexpr - Compile-time constant expressions. Could be nifty, i.e. can remove the need of hacks to do compiletime conversion of strings to hashes...
    • Delegating constructors - (supported in VS2013) A small addition, calling constructors from initializer lists of other constructors, it's useful but we already have workarounds and anyhow, you should really initialize things outside your constructor and never use exceptions. Even less interesting is constructor inheritance.
    • Raw string literals - (supported in VS2013) Another small addition, but important in some contexts, now you can have string literals that don't need escape codes, which is handy.
    • Unrestricted unions - Will enable having unions of types with non-trivial constructors which are not allowed today. No new syntax == good
    • Sizeof of member variables without an instance - The lack of this is really counter-intuitive and maddening 
  • Questionable/proceed with care/better to be avoided if possible:
    • Tl;Dr; don't use anything that adds more rules/alternative syntax for things that can be done already. Don't use templates, especially if you think you really found a cool way to use them (i.e. for anything that does not have to do with collections). Don't read Alexandrescu. Don't be smart.
    • Initializer lists - (supported in VS2013) These are nice, but they add more ways/rules to the resolution of constructors which is never great, function resolution rules in C++ are already way too complex. In some cases they're ok or even the only way to go (containers), but I would prefer to avoid them in custom classes and if there is another way around.
    • Variadic templates - (supported in VS2013) more template hackery. The syntax is quite ugly as well (...... or ...,... or ... ..., yes, let's try everything), but to be fair there are certain uses that might be worth allowing them in your code. An example is std::tuple. For "library" code only.
    • R-value references - They generated a lot of noise and you probably know about them (surely, you'll need to know about them), they do make a big difference in the STL (see this for an introduction) but the truth is, you probably already are careful to avoid temporaries (or objects!) and you don't do much work in your constructors... This is mostly good news for the STL and for the rare reasonable uses of templates (unfortunately, we didn't get concepts... so yes, C++ templates are still awful). They are complex. And that is NOT good, C++ is already obscure enough.
    • Typed enums - This is actually nice, but it adds yet more things to remember to the language, I'm undecided. The main good part of it is that typed enums don't automatically cast to integers (remember that vice-versa is already not true)
    • Noexcept - You shouldn't use exceptions anyways.
    • Extern templates - Could reduce code bloat due to templates by not having them instantiated in all translation units. It doesn't mean you don't have to have all your templates in your headers though, it's a bit of a mess to use. You shouldn't use many templates anyhow, right? It's better to use less templates than think "extern" will patch the issue
  • Minor/Already doable with C++98 workarounds/Not often needed
    • __func__ - Officially added alongside the existing __FILE__ and __LINE__
    • Minimal GC support - You're not likely going to use this, but it's good-to-know.
    • Static_assert - Chances are that you already know what this is and have macros defined. This new one has a better output from the compiler than your own stuff. The standardization of type traits is what makes static_assert very useful though.
    • Alignment - Chances are that you already have some compiler-dependent functions and macros etc defined (and also that you have aligned containers and aligned new, which C++11 still lacks... but hey, support for GC! no aligned new but support for GC... bah...). Chances are, they are clearer, more complete and easier to use than std::align, std::aligned_storage, std::max_align_t and all the crap. Also VS2012.2 std::align seems broken :|
    • Decltype - "Grabs" a type from an expression, fixes some old problems with templates, chances are that you'll never run into this other than some questionable uses in typedef decltype(expression)
    • Nullptr - Fairly minor, tl;dr NULL is now (also) called nullptr, which is a little bit better
    • Forward declaration of enums - Fairly minor, does what it says
    • Explicit conversion operator - (supported in VS2013)  Patches an ugly hole in the language with implicit conversions. You should ban all the implicit conversions (don't implement custom cast operators and mark all constructors as explicit) anyways and always use member functions instead, so you shouldn't find yourself needing it often...It has some usefulness with templates (which you should mostly avoid anyhow...)
    • Explicitly deleting or defaulting auto-generated class functions - Today, you should always remember to declare the functions C++ currently automatically implements for classes (private without implementation if you're not implementing them). This new extension will make that somewhat easier.
I've left out the new library features. C++11 introduced support for concurrency (atomics, threading support, fences, tasks, futures etc...) new smart pointers (unique/shared/weak with their corresponding "make" functions), containers (unordered_map, unordered_set, forward_list, array and std::optional) and so on.
Truth is, they are all nice enough and even needed, but they still fall short of what most people will need when crafting high performance applications (the domain of C++? surely, what we do in realtime rendering...) and chances are you already have rolled your own, optimized versions over these years which could still be even better than what the early compilers will provide on a given platform. E.G. over all these years we still don't have fundamental stuff like a fixed_vector, static_vector, and sorted/unsorted vector/list hybrid (buckets), concurrency is made of threads and not real tasks/jobs (thread pools), still no SIMD/instruction level parallelism etc.

C++ as a language is still so much behind on what matters for performance (regardless of Bjarne's wet dreams), we are and we will still be crafting our own stuff/relying on compiler extensions and intrinsics. We did well with that, we'll do well still.

Rant (can't be avoided when I write about C++): You'll be hearing (or already heard) a lot about "modern" C++, referring to C++11. It's a marketing lie, as most of what they did. Fundamentally, C++11 does not address any of the big issues C++ suffers from (bad defaults, pitfalls, half-arsed templates etc... basically the SIZE of the language and the quality of it), instead it's mostly concerned with "catching up" the back of the box feature list (and making an half-arsed attempt at that, as most things can't be done properly anyways...).
It doesn't even attempt to deprecate anything, it managed to kill the most useful features devoted at simplifying it (template concepts!), it adds a TON of new syntax while keeping the old defaults (no_except, the controls for automatic class functions...) thus hoping that you just remember to use it, and it adds a TON of features squarely aimed at crazy-template-metaprogramming users that most sane people will never allow anyways.
We don't do obfuscated C++ contests because it would be too easy already; with C++11, it would become really crazy...

If you want a full overview, see:

Peek'n'Poke

Sometimes I write tools small and stupid enough to be contained in a blog post. This is one of them...

I always wanted to have graphical visualizers inside visual studio, to see matrices, points, images and such things from raw memory locations. It turns out that's very simple if you just ReadProcessMemory from an external tool, even simpler than writing a Visual Studio extension. Of course, this doesn't work when remote debugging (and the simplest option there would be to write a server or just something intrusive in the code). 



This small C# sample displays images from a process's memory, refreshing every 33ms. It supports a few formats (r8 is broken as I was too lazy to set the palette, expect bugs in general...) but it could be easily extended to do whatever you need (e.g. graph floats over time...). 

Enjoy!

P.S. If you extend/fix/find anything incredibly dumb in the code below, leave a comment! Thanks...

In the future, it would be really cool to have a dynamic debugging/program visualization tool. There is already quite some work, also if you look in the reversing/hacking community.


Update: Now with floating point images support and endian swaps...
// See http://blackandodd.blogspot.ca/2012/12/c-read-and-write-process-memory-in.html
// and http://www.mpgh.net/forum/250-c-programming/298510-c-writeprocessmemory-readprocessmemory.html
 
using System;
 
namespace Peek
{
    class Program
    {
#region Kernel Imports
        // Process access-right bit masks, see
        // http://msdn.microsoft.com/en-us/library/windows/desktop/ms684880(v=vs.85).aspx
        // The ACL_* values are the standard access rights; the PROCESS_VM_* values are process-specific rights.
        const uint ACL_DELETE = 0x00010000;
        const uint ACL_READ_CONTROL = 0x00020000;
        const uint ACL_WRITE_DAC = 0x00040000;
        const uint ACL_WRITE_OWNER = 0x00080000;
        const uint ACL_SYNCHRONIZE = 0x00100000;
        // Low bits of the "all access" mask (the process-specific rights).
        // NOTE(review): the Windows SDK appears to use 0xFFFF on Vista and later and 0xFFF on XP/Server 2003,
        // i.e. the opposite of what the note below says - confirm before relying on PROCESS_ALL_ACCESS.
        const uint ACL_END = 0xFFF; //if you have Windows XP or Windows Server 2003 you must change this to 0xFFFF
        const uint PROCESS_VM_READ = 0x0010;
        const uint PROCESS_VM_WRITE = 0x0020;
        const uint PROCESS_VM_OPERATION = 0x0008;
        // Union of all the standard rights above plus ACL_END. Main only requests PROCESS_VM_READ.
        const uint PROCESS_ALL_ACCESS = (ACL_DELETE | ACL_READ_CONTROL | ACL_WRITE_DAC | ACL_WRITE_OWNER | ACL_SYNCHRONIZE | ACL_END);
 
        [System.Runtime.InteropServices.DllImport("kernel32.dll")]
        static extern uint OpenProcess(uint dwDesiredAccessbool bInheritHandleint dwProcessId);
        [System.Runtime.InteropServices.DllImport("kernel32.dll")]
        static extern bool ReadProcessMemory(uint hProcess, UIntPtr lpBaseAddress, IntPtr bufferuint sizeuint lpNumberOfBytesRead);
        /*[System.Runtime.InteropServices.DllImport("kernel32.dll")]
        static extern bool WriteProcessMemory(uint hProcess, UIntPtr lpBaseAddress, byte[] buffer, uint size, uint lpNumberOfBytesWritten);
        [System.Runtime.InteropServices.DllImport("kernel32.dll")]
        static extern bool WriteProcessMemory(uint hProcess, UIntPtr lpBaseAddress, IntPtr buffer, uint size, uint lpNumberOfBytesWritten);*/
 
        class UnmanagedMemWrapper // should we GC.AddMemoryPressure?
        {
            public UnmanagedMemWrapper(uint size)
            {
                this.ptr = System.Runtime.InteropServices.Marshal.AllocHGlobal((int)size);
            }
            ~UnmanagedMemWrapper()
            {
                System.Runtime.InteropServices.Marshal.FreeHGlobal(ptr);
            }
            
            public IntPtr ptr;
        }
#endregion // Kernel Imports
 
 
        // Utility, half2float, could use DirectXMath DirectX::PackedVector functions instead...
        [System.Runtime.InteropServices.DllImport("d3dx9_35.dll")]
        public static extern void D3DXFloat16To32Array(float[] output, IntPtr inputuint nfloats);
 
        static void PrintUsageAndErrors(string error)
        {
            System.Console.WriteLine("Peek");
            System.Console.WriteLine("----");
            System.Console.WriteLine();
            System.Console.WriteLine("Arguments: process name, instance number, pointer address, [peek mode]");
            System.Console.WriteLine("Note that multiple processes can have the same name...");
            System.Console.WriteLine();
            System.Console.WriteLine("Peek mode:");
            System.Console.WriteLine(" img [format] xsize ysize -- draws a 2d image");
            System.Console.WriteLine("  Supported formats: argb8 argb16 rgb8 rgb16 r8 r16 argb32f argb16f rgb32f rgb16f r32f r16f");
            System.Console.WriteLine();           
 
            if(error.Length!=0)
            {
                System.Console.WriteLine("Error!");
                System.Console.WriteLine(error);
            }
        }
 
        [STAThreadstatic void Main(string[] args)
        {
            if (args.Length < 5)
            {
                PrintUsageAndErrors("Not enough arguments"); return;
            }
 
            var procs = System.Diagnostics.Process.GetProcessesByName(args[0]);
            UInt32 procNumber = 0;
 
            if (!UInt32.TryParse(args[1], out procNumber))
            {
                PrintUsageAndErrors("Can't parse process number"); return;
            }
 
            if (procs.Length <= procNumber)
            {
                PrintUsageAndErrors("Process instance not found"); return;
            }
 
            var proc = procs[procNumber];
            uint procHandle = OpenProcess(PROCESS_VM_READfalseproc.Id);
 
            if (procHandle == 0)
            {
                PrintUsageAndErrors("Failed to open process"); return;
            }
 
            switch (args[3])
            {
                case "img":
                    {
                        UInt32 xsizeysize;
                        if ((!UInt32.TryParse(args[5], out xsize)) || (!UInt32.TryParse(args[6], out ysize)))
                        {
                            PrintUsageAndErrors("Can't parse img size"); return;
                        }
 
                        switch (args[4])
                        {
                            case "argb8":
                                PeekImg(procHandleargs[2], xsizeysize, 4, System.Drawing.Imaging.PixelFormat.Format32bppArgbImgOP.NONE);
                                break;
                            case "rgb8":
                                PeekImg(procHandleargs[2], xsizeysize, 3, System.Drawing.Imaging.PixelFormat.Format24bppRgbImgOP.NONE);
                                break;
                            case "argb16":
                                PeekImg(procHandleargs[2], xsizeysize, 8, System.Drawing.Imaging.PixelFormat.Format64bppArgbImgOP.NONE);
                                break;
                            case "rgb16":
                                PeekImg(procHandleargs[2], xsizeysize, 6, System.Drawing.Imaging.PixelFormat.Format48bppRgbImgOP.NONE);
                                break;
                            case "r8":
                                PeekImg(procHandleargs[2], xsizeysize, 1, System.Drawing.Imaging.PixelFormat.Format8bppIndexedImgOP.NONE);
                                break;
                            case "r16":
                                PeekImg(procHandleargs[2], xsizeysize, 2, System.Drawing.Imaging.PixelFormat.Format16bppGrayScaleImgOP.NONE);
                                break;
                            case "argb32f":
                                PeekImg(procHandleargs[2], xsizeysize, 4, System.Drawing.Imaging.PixelFormat.Format32bppArgbImgOP.F32_TO_I8);
                                break;
                            case "rgb32f":
                                PeekImg(procHandleargs[2], xsizeysize, 3, System.Drawing.Imaging.PixelFormat.Format24bppRgbImgOP.F32_TO_I8);
                                break;
                            case "argb16f":
                                PeekImg(procHandleargs[2], xsizeysize, 4, System.Drawing.Imaging.PixelFormat.Format32bppArgbImgOP.F16_TO_I8);
                                break;
                            case "rgb16f":
                                PeekImg(procHandleargs[2], xsizeysize, 3, System.Drawing.Imaging.PixelFormat.Format24bppRgbImgOP.F16_TO_I8);
                                break;
                            case "r32f":
                                PeekImg(procHandleargs[2], xsizeysize, 1, System.Drawing.Imaging.PixelFormat.Format8bppIndexedImgOP.F32_TO_I8);
                                break;
                            case "r16f":
                                PeekImg(procHandleargs[2], xsizeysize, 1, System.Drawing.Imaging.PixelFormat.Format8bppIndexedImgOP.F16_TO_I8);
                                break;
                            default:
                                PrintUsageAndErrors("Unknown image format");
                                return;
                        }
 
                        break;
                    }
                default:
                    PrintUsageAndErrors("Unknown peek options");
                    return;
            }
        }
        
        enum ImgOP { NONEF16_TO_I8F32_TO_I8 }
 
        class PeekImgForm : System.Windows.Forms.Form
        {
            public PeekImgForm()
            {
                DoubleBuffered = true;
                Text = "Peeker";
 
                Controls.Add(memControl); 
                Controls.Add(hdrScale);
                Controls.Add(noAlphaButton); 
                Controls.Add(endianSwapButton);
                Controls.Add(fillBlackButton);                
                Controls.Add(RBSwapButton);
                Controls.Add(xresControl);
                Controls.Add(yresControl);
                Controls.Add(resetButton);
 
                resetButton.Click += delegate(object senderSystem.EventArgs e)
                {
                    CreateBuffers();
                };
 
                var background = new System.Drawing.Drawing2D.HatchBrush(
                    System.Drawing.Drawing2D.HatchStyle.LargeCheckerBoardSystem.Drawing.Color.BlackSystem.Drawing.Color.White);
 
                Paint += delegate(object senderSystem.Windows.Forms.PaintEventArgs e)
                {
                    if (!ReadProcessMemory(procHandlepointerunmanagedMemory.ptrreadSize, 0))
                    {
                        e.Graphics.FillRectangle(System.Drawing.Brushes.Red, 0, 0, Bounds.WidthBounds.Height);
                        return;
                    }
 
                    float scale = (float)hdrScale.Value * 255.0f;
 
                    if ((format == System.Drawing.Imaging.PixelFormat.Format64bppArgb) ||
                        (format == System.Drawing.Imaging.PixelFormat.Format48bppRgb)) // these are not 16bpp, but 13, really
                    {
                        unsafe
                        {
                            ushortushortPtr = (ushort*)unmanagedMemory.ptr;
                            for (int i = 0; i < imageSize / 2; i++)
                                ushortPtr[i] >>= 3;
                        }
                    }
 
                    if (imgOp == ImgOP.F16_TO_I8)
                    {
                        D3DXFloat16To32Array(tempHalfToFloatMemoryunmanagedMemory.ptr, (uint)tempHalfToFloatMemory.Length);
                        unsafe
                        {
                            fixed (floatfloatPtr = tempHalfToFloatMemory)
                            {
                                bytebytePtr = (byte*)unmanagedMemory.ptr;
                                for (int i = 0; i < imageSizei++)
                                {
                                    float scaledVal = floatPtr[i] * scale;
                                    bytePtr[i] = (byte)(scaledVal > 255.0f ? 255.0f : scaledVal);
                                }
                            }
                        }
                    }
                    else if (imgOp == ImgOP.F32_TO_I8)
                    {
                        unsafe
                        {
                            bytebytePtr = (byte*)unmanagedMemory.ptr;
                            floatfloatPtr = (float*)unmanagedMemory.ptr;
                            /*for (int i = 0; i < imageSize; i += 4)
                            {
                                floatPtr[i] /= floatPtr[i + 3];
                                floatPtr[i+1] /= floatPtr[i + 3];
                                floatPtr[i+2] /= floatPtr[i + 3];
                            }*/
                            for (int i = 0; i < imageSizei++)
                            {
                                float scaledVal = floatPtr[i] * scale;
                                bytePtr[i] = (byte)(scaledVal > 255.0f ? 255.0f : scaledVal);
                            }
                        }
                    }
 
                    if (endianSwapButton.Checked)
                    {
                        unsafe
                        {
                            bytebytePtr = (byte*)unmanagedMemory.ptr;
                            for (int i = 0; i < imageSizei += 4)
                            {
                                byte temp = bytePtr[i + 3];
                                bytePtr[i + 3] = bytePtr[i];
                                bytePtr[i] = temp;
                                temp = bytePtr[i + 2];
                                bytePtr[i + 2] = bytePtr[i + 1];
                                bytePtr[i + 1] = temp;
                            }
                        }
                    }
 
                    if (RBSwapButton.Checked// Loop again, I don't want to code the variants...
                    {
                        unsafe
                        {
                            bytebytePtr = (byte*)unmanagedMemory.ptr;
                            for (int i = 0; i < imageSizei += 4)
                            {
                                byte temp = bytePtr[i + 2];
                                bytePtr[i + 2] = bytePtr[i];
                                bytePtr[i] = temp;
                            }
                        }
                    }
 
                    /*var data = bitmap.LockBits(new System.Drawing.Rectangle(0, 0, bitmap.Width, bitmap.Height)
                        , System.Drawing.Imaging.ImageLockMode.WriteOnly, bitmap.PixelFormat);
                    System.Diagnostics.Debug.Assert(data.Scan0 == unmanagedMemory.ptr);
                    bitmap.UnlockBits(data);*/
 
                    if (fillBlackButton.Checked)
                        e.Graphics.FillRectangle(System.Drawing.Brushes.Black, 0, 0, Bounds.WidthBounds.Height);
                    else
                        e.Graphics.FillRectangle(background, 0, 0, Bounds.WidthBounds.Height); // Draw a pattern to be able to "see" alpha...
 
                    if (noAlphaButton.Checked// TODO: add scaling options...
                        e.Graphics.DrawImage(bitmapnew System.Drawing.Rectangle(0, 60, bitmap.Widthbitmap.Height), 0, 0, bitmap.Widthbitmap.HeightSystem.Drawing.GraphicsUnit.PixelimageAttributesKillAlpha );
                    else
                        e.Graphics.DrawImageUnscaled(bitmap, 0, 60);
                };
            }
 
            public void SetParams(uint procHandlestring ptrStringSystem.Drawing.Imaging.PixelFormat formatuint xsizeuint ysizeuint bytesPPImgOP imgOpbool enableImgButtonsbool enableHDRButtons)
            {
                memControl.Text = ptrString;
                xresControl.Value = xsize;
                yresControl.Value = ysize;
 
                this.bytesPP = bytesPP;
                this.imgOp = imgOp;
                this.procHandle = procHandle;
                this.format = format;
 
                if (!enableImgButtons)
                {
                    endianSwapButton.Enabled = false;
                    RBSwapButton.Enabled = false;
                }
 
                if (!enableHDRButtons)
                {
                    hdrScale.Enabled = false;
                }
 
                Refresh();
            }
 
            public void CreateBuffers()
            {
                imageSize = (uint)xresControl.Value * bytesPP * (uint)yresControl.Value;
                readSize = imageSize;
                if (imgOp == ImgOP.F16_TO_I8)
                {
                    tempHalfToFloatMemory = new float[imageSize];
                    readSize *= 2;
                }
                else if (imgOp == ImgOP.F32_TO_I8)
                {
                    readSize *= 4;
                }
                unmanagedMemory = new UnmanagedMemWrapper(readSize);
 
                bitmap = new System.Drawing.Bitmap(
                    (int)xresControl.Value, (int)yresControl.Value, (int)(xresControl.Value * bytesPP), formatunmanagedMemory.ptr
                );
 
                System.Drawing.Imaging.ColorPalette palette = bitmap.Palette;
                if (palette.Entries.Length != 0)
                {
                    for (int i = 0; i < palette.Entries.Lengthi++)
                        palette.Entries.SetValue(System.Drawing.Color.FromArgb(255, iii), i);
                    bitmap.Palette = palette// weird dance...
                }
 
                imageAttributesKillAlpha = new System.Drawing.Imaging.ImageAttributes();
 
                float[][] colorMatrixElements = { 
                    new float[] {1, 0, 0, 0, 0}, // red scale
                    new float[] {0, 1, 0, 0, 0}, // green scale
                    new float[] {0, 0, 1, 0, 0}, // blue scale
                    new float[] {0, 0, 0, 1, 0}, // alpha scale
                    new float[] {0, 0, 0, 1, 1}}; // translation
                imageAttributesKillAlpha.SetColorMatrix(
                    new System.Drawing.Imaging.ColorMatrix(colorMatrixElements), System.Drawing.Imaging.ColorMatrixFlag.DefaultSystem.Drawing.Imaging.ColorAdjustType.Bitmap
                ); // TODO: RGB swaps and R-G-B channel selections and so on can/should be done with a matrix instead of the way they are currently implemented (i.e. endianSwapButton...)
 
                UInt64 pointerInt = 0;
                if (memControl.Text.StartsWith("0x"))
                {
                    try
                    {
                        pointerInt = Convert.ToUInt64(memControl.Text.Substring(2), 16);
                    }
                    catch (System.Exception) { memControl.Text = "Can't parse ptr"; }
                }
                else if (!UInt64.TryParse(memControl.Textout pointerInt))
                {
                    memControl.Text = "Can't parse ptr";
                }
                pointer = new UIntPtr(pointerInt);
 
                Refresh();
            }
 
            uint imageSize = 0;
            uint readSize = 0;
            float[] tempHalfToFloatMemory = null;
            UnmanagedMemWrapper unmanagedMemory = null;
            UIntPtr pointer = new UIntPtr(0);
            System.Drawing.Bitmap bitmap = null;
 
            uint bytesPP;
            ImgOP imgOp;
            uint procHandle;
            System.Drawing.Imaging.PixelFormat format;
            System.Drawing.Imaging.ImageAttributes imageAttributesKillAlpha;
 
            // Meh, there was no reason to do all this by hand really...
            System.Windows.Forms.CheckBox noAlphaButton = new System.Windows.Forms.CheckBox() { Text = "NoAlpha"Left = 0, Width = 70 };
            System.Windows.Forms.CheckBox endianSwapButton = new System.Windows.Forms.CheckBox() { Text = "Endian"Left = 70, Width = 70 };
            System.Windows.Forms.CheckBox RBSwapButton = new System.Windows.Forms.CheckBox() { Text = "RB Swap"Left = 140, Width = 70 };            
            System.Windows.Forms.NumericUpDown hdrScale = new System.Windows.Forms.NumericUpDown() { DecimalPlaces = 2, Minimum = -999999, Maximum = 999999, Increment = 0.25m, Value = 1, Left = 330, Width = 50 };
            System.Windows.Forms.CheckBox fillBlackButton = new System.Windows.Forms.CheckBox() { Text = "Black Backgr."Left = 380, Width = 70 };
 
            System.Windows.Forms.NumericUpDown xresControl = new System.Windows.Forms.NumericUpDown() { Minimum = 0, Maximum = 9999, Top = 25, Left = 0, Width = 105 };
            System.Windows.Forms.NumericUpDown yresControl = new System.Windows.Forms.NumericUpDown() { Minimum = 0, Maximum = 9999, Top = 25, Left = 105, Width = 105 };
            System.Windows.Forms.TextBox memControl = new System.Windows.Forms.TextBox() { Left = 210, Width = 170, Top = 25 };
            System.Windows.Forms.Button resetButton = new System.Windows.Forms.Button() { Text = "Region Update"Left = 380, Top = 25, Width = 100 };
          
        }
 
        static void PeekImg(
            uint procHandlestring ptrString, UInt32 xsize, UInt32 ysize, UInt32 bytesPP,
            System.Drawing.Imaging.PixelFormat formatImgOP imgOp = ImgOP.NONE
        ) // TODO: move the format params into a drop-down of the form, instead of having to specify by hand in the commandline...
        {
            using (var form = new PeekImgForm())
            {
 
                var timer = new System.Windows.Forms.Timer() { Interval = 33, Enabled = true };
                timer.Tick += delegate(object senderEventArgs e)
                {
                    //form.Refresh(); // TODO: Enable-Disable auto refresh switch, via a command-line switch or a checkbox
                };
 
                form.SetBounds(0, 0, xsize > 600 ? (int)xsize : 600, (int)ysize + 100);
                form.SetParams(procHandleptrStringformatxsizeysizebytesPPimgOp,
                    format == System.Drawing.Imaging.PixelFormat.Format32bppArgb, 
                    imgOp != ImgOP.NONE
                ); 
                form.CreateBuffers();         
 
                // Run...
                System.Windows.Forms.Application.EnableVisualStyles();
                form.Show(); form.Focus(); timer.Start();
                System.Windows.Forms.Application.Run(form);
            }
        }
    }
}