[Cmake-commits] [cmake-commits] king committed CMakeLists.txt 1.434 1.435 cmXMLSafe.cxx 1.6 1.7 cm_utf8.c NONE 1.1 cm_utf8.h NONE 1.1

cmake-commits at cmake.org cmake-commits at cmake.org
Tue Dec 8 15:43:58 EST 2009


Update of /cvsroot/CMake/CMake/Source
In directory public:/mounts/ram/cvs-serv1213/Source

Modified Files:
	CMakeLists.txt cmXMLSafe.cxx 
Added Files:
	cm_utf8.c cm_utf8.h 
Log Message:
CTest: Do not munge UTF-8 output in XML files

CTest filters the output from tools and tests to ensure that the XML
build/test result documents it generates have valid characters.
Previously we just converted all non-ASCII bytes into XML-escaped
Unicode characters of the corresponding index.  This does not preserve
tool output encoded in UTF-8.

We now assume UTF-8 output from tools and implement decoding as
specified in RFC 3629.  Valid characters are preserved, possibly with
XML escaping.  Invalid byte sequences and characters are converted to
human-readable hex values with distinguishing tags.  See issue #10003.


--- NEW FILE: cm_utf8.h ---
/*============================================================================
  CMake - Cross Platform Makefile Generator
  Copyright 2000-2009 Kitware, Inc., Insight Software Consortium

  Distributed under the OSI-approved BSD License (the "License");
  see accompanying file Copyright.txt for details.

  This software is distributed WITHOUT ANY WARRANTY; without even the
  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  See the License for more information.
============================================================================*/
#ifndef cm_utf8_h
#define cm_utf8_h

#ifdef __cplusplus
extern "C" {
#endif

/** Decode one UTF-8 character from the input byte range [first, last).

    Callers should pass a non-empty range (first != last).

    On success, stores the unicode character number in *pc and returns
    the first position not extracted (a pointer in (first, last]).

    On failure, returns 0 and does not modify *pc.  Failure means the
    bytes at 'first' do not form a complete, shortest-form sequence:
    an invalid leading byte, a missing or malformed continuation byte,
    a sequence cut short by 'last', or an overlong encoding.

    In addition to the 1- to 4-byte forms of RFC 3629, the older 5- and
    6-byte forms are decoded, so the value stored in *pc may exceed
    0x10FFFF.  No check is made that the value is an assigned or
    otherwise meaningful code point; that is left to the caller.  */
const char* cm_utf8_decode_character(const char* first, const char* last,
                                     unsigned int* pc);

#ifdef __cplusplus
} /* extern "C" */
#endif

#endif

--- NEW FILE: cm_utf8.c ---
/*============================================================================
  CMake - Cross Platform Makefile Generator
  Copyright 2000-2009 Kitware, Inc., Insight Software Consortium

  Distributed under the OSI-approved BSD License (the "License");
  see accompanying file Copyright.txt for details.

  This software is distributed WITHOUT ANY WARRANTY; without even the
  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  See the License for more information.
============================================================================*/
#include "cm_utf8.h"

/*
  RFC 3629
  07-bit: 0xxxxxxx
  11-bit: 110xxxxx 10xxxxxx
  16-bit: 1110xxxx 10xxxxxx 10xxxxxx
  21-bit: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

  Pre-RFC Compatibility
  26-bit: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  31-bit: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*/

/* Number of leading ones before a zero in the byte.  Indexed by the
   byte value; 0xFF (no zero bit at all) maps to 8.  */
static unsigned char const cm_utf8_ones[256] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,7,8
};

/* Mask away control bits from bytes with n leading ones.  Entry 0 is
   not consulted by the decoder below (one-byte characters are returned
   as-is); it is kept so that the table can be indexed directly by the
   leading-ones count.  */
static unsigned char const cm_utf8_mask[7] = {
  0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
};

/* Minimum allowed value when first byte has n leading ones.  A decoded
   value below this minimum could have been written with fewer bytes,
   i.e. it is an overlong encoding and is rejected.  */
static unsigned int const cm_utf8_min[7] = {
  0, 0, 1u<<7, 1u<<11, 1u<<16, 1u<<21, 1u<<26 /*, 1u<<31 */
};

/*--------------------------------------------------------------------------*/
/* Decode one UTF-8 character from the byte range [first, last).

   On success, stores the character number in *pc and returns the
   position just past the bytes consumed.  On failure, returns 0 and
   leaves *pc untouched.  Failure cases are: an empty range, an invalid
   leading byte (a continuation byte, 0xFE or 0xFF), a continuation
   position holding a byte that is not of the form 10xxxxxx, a sequence
   cut short by 'last', and an overlong encoding.

   The 5- and 6-byte pre-RFC forms are accepted in addition to the
   RFC 3629 forms, so the stored value may exceed 0x10FFFF.  */
const char* cm_utf8_decode_character(const char* first, const char* last,
                                     unsigned int* pc)
{
  unsigned char c;
  unsigned char ones;

  /* We need at least one byte; never read from an empty range.  */
  if(first == last)
    {
    return 0;
    }

  /* Count leading ones in the first byte.  Go through unsigned char
     explicitly so the table index is in [0,255] even when plain char
     is signed.  */
  c = (unsigned char)*first++;
  ones = cm_utf8_ones[c];
  switch(ones)
    {
    case 0: *pc = c; return first;    /* One-byte character.  */
    case 1: case 7: case 8: return 0; /* Invalid leading byte.  */
    default: break;
    }

  /* Extract bits from this multi-byte character.  */
  {
  unsigned int uc = c & cm_utf8_mask[ones];
  unsigned char left;
  /* A leading byte with n ones is followed by n-1 continuation bytes,
     each contributing its low 6 bits.  */
  for(left = (unsigned char)(ones-1); left && first != last; --left)
    {
    c = (unsigned char)*first++;
    if(cm_utf8_ones[c] != 1)
      {
      return 0;
      }
    uc = (uc << 6) | (c & cm_utf8_mask[1]);
    }

  /* Reject sequences truncated by the end of input (left > 0) and
     overlong encodings (value too small for the byte count).  */
  if(left > 0 || uc < cm_utf8_min[ones])
    {
    return 0;
    }

  *pc = uc;
  return first;
  }
}

Index: cmXMLSafe.cxx
===================================================================
RCS file: /cvsroot/CMake/CMake/Source/cmXMLSafe.cxx,v
retrieving revision 1.6
retrieving revision 1.7
diff -C 2 -d -r1.6 -r1.7
*** cmXMLSafe.cxx	28 Sep 2009 15:42:50 -0000	1.6
--- cmXMLSafe.cxx	8 Dec 2009 20:43:55 -0000	1.7
***************
*** 12,15 ****
--- 12,17 ----
  #include "cmXMLSafe.h"
  
+ #include "cm_utf8.h"
+ 
  #include <cmsys/ios/iostream>
  #include <cmsys/ios/sstream>
***************
*** 54,95 ****
    char const* first = self.Data;
    char const* last = self.Data + self.Size;
!   for(char const* ci = first; ci != last; ++ci)
      {
!     unsigned char c = static_cast<unsigned char>(*ci);
!     switch(c)
        {
!       case '&': os << "&amp;"; break;
!       case '<': os << "&lt;"; break;
!       case '>': os << "&gt;"; break;
!       case '"': os << (self.DoQuotes? "&quot;" : "\""); break;
!       case '\'': os << (self.DoQuotes? "&apos;" : "'"); break;
!       case '\t': os << "\t"; break;
!       case '\n': os << "\n"; break;
!       case '\r': break; // Ignore CR
!       default:
!         if(c >= 0x20 && c <= 0x7f)
!           {
!           os.put(static_cast<char>(c));
!           }
!         else
            {
!           // TODO: More complete treatment of program output character
!           // encoding.  Instead of escaping these bytes, we should
!           // handle the current locale and its encoding.
!           char buf[16];
!           // http://www.w3.org/TR/REC-xml/#NT-Char
!           if(c >= 0x80)
!             {
!             sprintf(buf, "&#x%hx;", static_cast<unsigned short>(c));
!             }
!           else
!             {
!             // We cannot use "&#x%hx;" here because this value is not
!             // valid in XML.  Instead use a human-readable hex value.
!             sprintf(buf, "&lt;0x%hx&gt;", static_cast<unsigned short>(c));
!             }
!           os << buf;
            }
!         break;
        }
      }
--- 56,100 ----
    char const* first = self.Data;
    char const* last = self.Data + self.Size;
!   while(first != last)
      {
!     unsigned int ch;
!     if(const char* next = cm_utf8_decode_character(first, last, &ch))
        {
!       // http://www.w3.org/TR/REC-xml/#NT-Char
!       if((ch >= 0x20 && ch <= 0xD7FF) ||
!          (ch >= 0xE000 && ch <= 0xFFFD) ||
!          (ch >= 0x10000 && ch <= 0x10FFFF) ||
!           ch == 0x9 || ch == 0xA || ch == 0xD)
!         {
!         switch(ch)
            {
!           // Escape XML control characters.
!           case '&': os << "&amp;"; break;
!           case '<': os << "&lt;"; break;
!           case '>': os << "&gt;"; break;
!           case '"': os << (self.DoQuotes? "&quot;" : "\""); break;
!           case '\'': os << (self.DoQuotes? "&apos;" : "'"); break;
!           case '\r': break; // Ignore CR
!           // Print the UTF-8 character.
!           default: os.write(first, next-first); break;
            }
!         }
!       else
!         {
!         // Use a human-readable hex value for this invalid character.
!         char buf[16];
!         sprintf(buf, "%X", ch);
!         os << "[NON-XML-CHAR-0x" << buf << "]";
!         }
! 
!       first = next;
!       }
!     else
!       {
!       ch = static_cast<unsigned char>(*first++);
!       // Use a human-readable hex value for this invalid byte.
!       char buf[16];
!       sprintf(buf, "%X", ch);
!       os << "[NON-UTF-8-BYTE-0x" << buf << "]";
        }
      }

Index: CMakeLists.txt
===================================================================
RCS file: /cvsroot/CMake/CMake/Source/CMakeLists.txt,v
retrieving revision 1.434
retrieving revision 1.435
diff -C 2 -d -r1.434 -r1.435
*** CMakeLists.txt	8 Dec 2009 16:44:28 -0000	1.434
--- CMakeLists.txt	8 Dec 2009 20:43:55 -0000	1.435
***************
*** 247,250 ****
--- 247,253 ----
    cmakewizard.cxx
    cmakewizard.h
+ 
+   cm_utf8.h
+   cm_utf8.c
    )
  



More information about the Cmake-commits mailing list