// mailnews/import/outlook/src/rtfDecoder.cpp
// author David Bienvenu <bienvenu@nventure.com>
// Tue, 17 May 2011 12:27:14 -0700
// changeset 7784 90c3929c5b5d13d2e677fff21e64cf0831f55e79
// child 8286 7ac67e03558bfe35ec3d04d80a63d6cba17cf306
// permissions -rw-r--r--
// Mike Kaganski <mikekaganski@gmail.com>

/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is mozilla.org code
 *
 * The Initial Developer of the Original Code is
 * Mike Kaganski <mikekaganski@gmail.com>.
 * Portions created by the Initial Developer are Copyright (C) 2010
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

#include <locale>
#include <stack>
#include <map>
#include <sstream>
#include "Windows.h"
#include "rtfDecoder.h"

#define SIZEOF(x) (sizeof(x)/sizeof((x)[0]))

// Converts a single hexadecimal digit character ('0'-'9', 'A'-'F', 'a'-'f')
// to its numeric value in the range 0..15. Any other character yields 0.
inline int HexToInt(char ch)
{
  if (ch >= '0' && ch <= '9')
    return ch - '0';
  if (ch >= 'A' && ch <= 'F')
    return ch - 'A' + 10;
  if (ch >= 'a' && ch <= 'f')
    return ch - 'a' + 10;
  return 0; // Not a hex digit
}

// Maps an RTF \fcharsetN value to the code page used to decode 8-bit text
// written in that font. Charsets that are not listed in the table (including
// the ones whose code page we do not know) fall back to CP_ACP.
inline int CharsetToCP(int charset)
{
  struct CharsetEntry {
    int charset;
    int codepage;
  };
  static const CharsetEntry kKnownCharsets[] = {
    {0,   1252},  // ANSI
    {1,   0},     // Default
    {2,   1252},  // Symbol (its real code page, 42, is deliberately not used)
    {77,  10000}, // Mac Roman
    {78,  10001}, // Mac Shift Jis
    {79,  10003}, // Mac Hangul
    {80,  10008}, // Mac GB2312
    {81,  10002}, // Mac Big5
                  // 82: Mac Johab (old) - code page unknown
    {83,  10005}, // Mac Hebrew
    {84,  10004}, // Mac Arabic
    {85,  10006}, // Mac Greek
    {86,  10081}, // Mac Turkish
    {87,  10021}, // Mac Thai
    {88,  10029}, // Mac East Europe
    {89,  10007}, // Mac Russian
    {128, 932},   // Shift JIS
    {129, 949},   // Hangul
    {130, 1361},  // Johab
    {134, 936},   // GB2312
    {136, 950},   // Big5
    {161, 1253},  // Greek
    {162, 1254},  // Turkish
    {163, 1258},  // Vietnamese
    {177, 1255},  // Hebrew
    {178, 1256},  // Arabic
                  // 179: Arabic Traditional (old) - code page unknown
                  // 180: Arabic user (old) - code page unknown
                  // 181: Hebrew user (old) - code page unknown
    {186, 1257},  // Baltic
    {204, 1251},  // Russian
    {222, 874},   // Thai
    {238, 1250},  // Eastern European
    {254, 437},   // PC 437
    {255, 850}    // OEM
  };
  const unsigned int count = sizeof(kKnownCharsets)/sizeof(kKnownCharsets[0]);
  for (unsigned int idx = 0; idx < count; ++idx) {
    if (kKnownCharsets[idx].charset == charset)
      return kKnownCharsets[idx].codepage;
  }
  return CP_ACP;
}

// Code page related properties of one font table entry. The options bit mask
// records which of \fcharsetN and \cpgN were actually seen for the font.
struct FontInfo {
  enum Options {has_fcharset = 0x0001,
                has_cpg      = 0x0002};
  unsigned int options; // Combination of Options flags
  int fcharset;         // Value of \fcharsetN; valid if has_fcharset is set
  unsigned int cpg;     // Value of \cpgN; valid if has_cpg is set
  FontInfo() : options(0), fcharset(0), cpg(0xFFFFFFFF) {}
  // Returns the code page of the font: an explicit \cpg wins over the one
  // derived from \fcharset; 0xFFFFFFFF means the font defines neither.
  unsigned int Codepage()
  {
    if ((options & has_cpg) != 0)
      return cpg;
    if ((options & has_fcharset) == 0)
      return 0xFFFFFFFF;
    return CharsetToCP(fcharset);
  }
};
// Font table: font index (the N of \fN) -> font properties
typedef std::map<int, FontInfo> Fonttbl;

// Parser state that is scoped to an RTF group. UpdateState() pushes a copy of
// the current top entry on every group begin and pops it on every group end,
// so changes made inside a group do not outlive it.
struct LocalState {
  bool fonttbl;         // When fonts are being defined
  int f;                // Index of the font being defined/used; defines the codepage if no \cpg
                        // (-1 means no font is selected)
  unsigned int uc;      // ucN keyword value; its default is 1
  unsigned int codepage;// defined by \cpg (0xFFFFFFFF means \cpg is not in use)
};
// One LocalState per currently open group; the innermost group is on top
typedef std::stack<LocalState> StateStack;

struct GlobalState {
  enum Pcdata_state { pcdsno, pcdsin, pcdsfinished };
  std::istream& stream;
  Fonttbl fonttbl;
  StateStack stack;
  unsigned int codepage; // defined by \ansi, \mac, \pc, \pca, and \ansicpgN
  int deff;
  std::stringstream pcdata_a;
  unsigned int pcdata_a_codepage;
  Pcdata_state pcdata_a_state;

  GlobalState(std::istream& s)
    : stream(s), codepage(CP_ACP), deff(-1), pcdata_a_state(pcdsno)
  {
    LocalState st;
    st.fonttbl = false;
    st.f = -1;
    st.uc = 1;
    st.codepage = 0xFFFFFFFF;
    stack.push(st);
  }
  unsigned int GetCurrentCP()
  {
    if (stack.top().codepage != 0xFFFFFFFF) // \cpg in use
      return stack.top().codepage;
    // \cpg not used; use font settings
    int f = (stack.top().f != -1) ? stack.top().f : deff; 
    if (f != -1) {
      Fonttbl::iterator iter = fonttbl.find(f);
      if (iter != fonttbl.end()) {
        unsigned int cp = iter->second.Codepage();
        if (cp != 0xFFFFFFFF)
          return cp;
      }
    }
    return codepage; // No overrides; use the top-level legacy setting
  }
};

// A control word or a control symbol as read from the RTF stream.
struct Keyword {
  char name[33]; // up to 32 characters plus the terminating NUL
  bool hasVal;   // true if a numeric parameter followed the name
  int val;       // the numeric parameter; meaningful only if hasVal is true
};

// One lexical unit of the RTF stream: a group delimiter, a keyword, a single
// 8-bit or UTF-16 character, a block of binary data, end of input, or an
// error marker. The binary data buffer is owned by the lexem; copying a
// lexem makes an independent copy of the buffer and leaves the source intact,
// so constant objects and temporaries can be copied as well.
class Lexem {
public:
  enum Type {ltGroupBegin, ltGroupEnd, ltKeyword, ltPCDATA_A, ltPCDATA_W,
             ltBDATA, ltEOF, ltError};
  // Creates a lexem of the given type with an empty payload.
  Lexem(Type t=ltError) : m_type(t)
  {
    // Give the payload of the selected type a defined value, so that the
    // accessors and Clear() never work with indeterminate data.
    switch (t) {
    case ltKeyword:
      m_keyword.name[0] = 0;
      m_keyword.hasVal = false;
      m_keyword.val = 0;
      break;
    case ltPCDATA_A:
      m_pcdata_a = 0;
      break;
    case ltPCDATA_W:
      m_pcdata_w = 0;
      break;
    case ltBDATA:
      m_bdata.sz = 0;
      m_bdata.data = 0;
      break;
    default:
      break;
    }
  }
  Lexem(const Lexem& from) : m_type(ltError) { CopyFrom(from); }
  ~Lexem() { Clear(); }
  Lexem& operator = (const Lexem& from)
  {
    if (&from != this) {
      Clear();
      m_type = ltError; // nothing is owned until CopyFrom succeeds
      CopyFrom(from);
    }
    return *this;
  }
  Type type() const { return m_type; }
  void SetPCDATA_A(char chdata)
  {
    Clear();
    m_pcdata_a = chdata;
    m_type = ltPCDATA_A;
  }
  void SetPCDATA_W(wchar_t chdata)
  {
    Clear();
    m_pcdata_w = chdata;
    m_type = ltPCDATA_W;
  }
  // Stores a private copy of sz bytes starting at data. A negative sz, a null
  // data with a non-zero sz, or a failed allocation turn the lexem into
  // ltError.
  void SetBDATA(const char* data, int sz)
  {
    // Copy first, release afterwards: data may point into our own buffer
    char* tmp = 0;
    if ((sz >= 0) && (data || (sz == 0))) {
      tmp = new char[sz];
      if (tmp && (sz > 0))
        memcpy(tmp, data, sz);
    }
    Clear();
    if (tmp) {
      m_bdata.data = tmp;
      m_bdata.sz = sz;
      m_type = ltBDATA;
    }
    else m_type = ltError;
  }
  void SetKeyword(const Keyword& src)
  {
    Clear();
    m_type = ltKeyword;
    m_keyword = src;
  }
  // Names longer than 32 characters are truncated.
  void SetKeyword(const char* name, bool hasVal=false, int val=0)
  {
    char tmp[SIZEOF(m_keyword.name)];
    strncpy(tmp, name, SIZEOF(m_keyword.name)-1); // to allow copy from itself
    tmp[SIZEOF(m_keyword.name)-1]=0;
    Clear();
    m_type = ltKeyword;
    memcpy(m_keyword.name, tmp, SIZEOF(m_keyword.name));
    m_keyword.hasVal=hasVal;
    m_keyword.val=val;
  }
  // The accessors below return 0 when the lexem is not of the matching type.
  const char* KeywordName() const {
    return (m_type == ltKeyword) ? m_keyword.name : 0; }
  // Returns 0 also when the keyword has no numeric parameter.
  const int* KeywordVal() const {
    return ((m_type == ltKeyword) && m_keyword.hasVal) ? &m_keyword.val : 0; }
  char pcdata_a() const { return (m_type == ltPCDATA_A) ? m_pcdata_a : 0; }
  wchar_t pcdata_w() const { return (m_type == ltPCDATA_W) ? m_pcdata_w : 0; }
  const char* bdata() const { return (m_type == ltBDATA) ? m_bdata.data : 0; }
  int bdata_sz() const {
    return (m_type == ltBDATA) ? static_cast<int>(m_bdata.sz) : 0; }
  static Lexem eof;
  static Lexem groupBegin;
  static Lexem groupEnd;
  static Lexem error;
private:
  struct BDATA {
    size_t sz;
    char* data;
  };

  Type m_type;
  union {
    Keyword m_keyword;
    char m_pcdata_a;
    wchar_t m_pcdata_w;
    BDATA m_bdata;
  };
  // Copies the type and the payload of from into this object. The object must
  // not own a buffer when this is called. If the buffer for binary data
  // cannot be allocated, the object is left as ltError.
  void CopyFrom(const Lexem& from)
  {
    switch (from.m_type) {
    case ltKeyword:
      m_keyword = from.m_keyword;
      break;
    case ltPCDATA_A:
      m_pcdata_a = from.m_pcdata_a;
      break;
    case ltPCDATA_W:
      m_pcdata_w = from.m_pcdata_w;
      break;
    case ltBDATA:
      {
        char* copy = 0;
        size_t sz = 0;
        if (from.m_bdata.data) {
          sz = from.m_bdata.sz;
          copy = new char[sz];
          if (!copy) {
            m_type = ltError;
            return;
          }
          if (sz > 0)
            memcpy(copy, from.m_bdata.data, sz);
        }
        m_bdata.data = copy;
        m_bdata.sz = sz;
      }
      break;
    default: // The remaining types carry no payload
      break;
    }
    m_type = from.m_type;
  }
  // This function leaves the object in the broken state. Must be followed
  // by a correct initialization.
  void Clear()
  {
    if (m_type == ltBDATA)
      delete[] m_bdata.data;
  }
};

Lexem Lexem::eof(ltEOF);
Lexem Lexem::groupBegin(ltGroupBegin);
Lexem Lexem::groupEnd(ltGroupEnd);
Lexem Lexem::error(ltError);

// Reads one control word or control symbol from the stream. When calling the
// function, the stream must be positioned right after the backslash.
//
// Returns a keyword with an empty name if the stream is already at its end.
// For a control word (a run of letters) the name is limited to 32 characters,
// and an optional decimal parameter, possibly negative, is read into val with
// hasVal set to true. Only the first 10 digits of the parameter are taken
// into account, and a magnitude that does not fit into an int is clamped to
// the largest int value instead of overflowing. A single whitespace character
// that terminates the control word is consumed; any other terminating
// character is put back. For a control symbol the name is the single
// character that follows the backslash.
Keyword GetKeyword(std::istream& stream)
{
  Keyword keyword = {"", false, 0};
  char ch;
  if (stream.get(ch).eof())
    return keyword;

  const std::locale& classic = std::locale::classic();
  if (!std::isalpha(ch, classic)) { // Control symbol
    keyword.name[0] = ch;
    keyword.name[1] = 0;
    return keyword;
  }

  // Control word; maybe delimiter and value
  int len = 0;
  do {
    // We take up to 32 characters into account, skipping over extra
    // characters (allowing for some non-conformant implementation).
    if (len < 32)
      keyword.name[len++] = ch;
  } while (!stream.get(ch).eof() && std::isalpha(ch, classic));
  keyword.name[len] = 0; // NUL-terminating

  if (!stream.eof() && (std::isdigit(ch, classic) || (ch == '-'))) { // Value begin
    keyword.hasVal = true;
    const bool negative = (ch == '-');
    if (negative) stream.get(ch);

    // Largest magnitude that an int can hold (int and unsigned int are
    // assumed to have the same width)
    const unsigned int maxMagnitude = static_cast<unsigned int>(-1) >> 1;
    unsigned int magnitude = 0;
    int digits = 0;
    while (!stream.eof() && std::isdigit(ch, classic)) {
      // We take into account only 10 digits, skip other. Older specs stated
      // that we must be ready for an arbitrary number of digits.
      if (digits < 10) {
        ++digits;
        const unsigned int digit = static_cast<unsigned int>(ch - '0');
        if (magnitude > (maxMagnitude - digit) / 10)
          magnitude = maxMagnitude; // Saturate; 10 digits may exceed an int
        else
          magnitude = magnitude*10 + digit;
      }
      stream.get(ch);
    }
    keyword.val = negative ? -static_cast<int>(magnitude)
                           : static_cast<int>(magnitude);
  }

  // End of control word; the space is just a delimiter - skip it
  if (!stream.eof() && !std::isspace(ch, classic))
    stream.unget();
  return keyword;
}

// Reads the next raw lexem from the stream. Line breaks in front of the lexem
// are skipped. The result is one of:
// - the end-of-input lexem, if nothing is left in the stream;
// - group begin/end for the braces;
// - a keyword for everything that starts with a backslash (\u, \', \bin and
//   the escaped characters are not interpreted here);
// - the keyword "tab" for a literal tab character (0x09);
// - a single 8-bit character (PCDATA) for anything else.
Lexem GetLexem(std::istream& stream)
{
  Lexem token;
  char ch;
  // Skip crlf
  do {
    stream.get(ch);
  } while (!stream.eof() && ((ch == '\r') || (ch == '\n')));

  if (stream.eof())
    token = Lexem::eof;
  else if (ch == '{') // Group begin
    token = Lexem::groupBegin;
  else if (ch == '}') // Group end
    token = Lexem::groupEnd;
  else if (ch == '\\') // Keyword
    token.SetKeyword(GetKeyword(stream));
  else if (ch == '\t') // tab
    token.SetKeyword("tab");
  else // PCDATA
    token.SetPCDATA_A(ch);
  return token;
}

// Turns the keywords that actually denote data into data lexems, in place:
// - a keyword with an empty name becomes an error;
// - \uN becomes a UTF-16 character, and the uc characters that follow it
//   in the stream are skipped;
// - \'hh becomes an 8-bit character; an error if the stream ends before both
//   hex digits are read;
// - \\, \{ and \} become the corresponding 8-bit character;
// - \binN becomes binary data made of the next N bytes of the stream; an
//   error if N is missing or negative, or if the bytes cannot be read;
// - a backslash followed by cr or lf becomes the keyword \par.
// Lexems of other types and all other keywords are left untouched.
// uc is the number of characters to skip after \uN.
void PreprocessLexem(/*inout*/Lexem& lexem, std::istream& stream, int uc)
{
  if (lexem.type() != Lexem::ltKeyword)
    return;

  // name points into the lexem; it must not be used once the lexem is changed
  const char* name = lexem.KeywordName();
  if (name[0] == 0) // Empty keyword - maybe eof?
    lexem = Lexem::error;
  else if (eq(name, "u")) {
    // Unicode character - get the UTF16 and skip the uc characters
    if (const int* val = lexem.KeywordVal()) {
      lexem.SetPCDATA_W(*val);
      if (uc > 0)
        stream.ignore(uc);
    }
    else lexem = Lexem::error;
  }
  else if (eq(name, "'")) {
    // 8-bit character (\'hh) -> use current codepage
    char hi, lo;
    if (stream.get(hi).eof() || stream.get(lo).eof())
      lexem = Lexem::error; // Truncated escape; there is no character to emit
    else
      lexem.SetPCDATA_A(static_cast<char>((HexToInt(hi) << 4) + HexToInt(lo)));
  }
  else if (eq(name, "\\") || eq(name, "{") || eq(name, "}")) // escaped characters
    lexem.SetPCDATA_A(name[0]);
  else if (eq(name, "bin")) {
    const int* count = lexem.KeywordVal();
    if (!count || (*count < 0)) // A negative size must never reach new[]
      lexem = Lexem::error;
    else {
      const int sz = *count; // Copy: the value lives inside the lexem
      char* data = new char[sz];
      if (data) {
        stream.read(data, sz);
        if (stream.fail())
          lexem = Lexem::error;
        else
          lexem.SetBDATA(data, sz);
        delete[] data;
      }
      else lexem = Lexem::error;
    }
  }
  else if (eq(name, "\n") || eq(name, "\r")) {
    // escaped cr or lf
    lexem.SetKeyword("par");
  }
}

void UpdateState(const Lexem& lexem, /*inout*/GlobalState& globalState)
{
  switch (globalState.pcdata_a_state) {
  case GlobalState::pcdsfinished: // Last time we finished the pcdata
    globalState.pcdata_a_state = GlobalState::pcdsno;
    break;
  case GlobalState::pcdsin:
     // to be reset later if still in the pcdata
    globalState.pcdata_a_state = GlobalState::pcdsfinished;
    break;
  }

  switch (lexem.type()) {
  case Lexem::ltGroupBegin:
    globalState.stack.push(globalState.stack.top());
    break;
  case Lexem::ltGroupEnd:
    globalState.stack.pop();
    break;
  case Lexem::ltKeyword:
    {
      const int* val = lexem.KeywordVal();
      if (eq(lexem.KeywordName(), "ansi")) globalState.codepage = CP_ACP;
      else if (eq(lexem.KeywordName(), "mac")) globalState.codepage = CP_MACCP;
      else if (eq(lexem.KeywordName(), "pc")) globalState.codepage = 437;
      else if (eq(lexem.KeywordName(), "pca")) globalState.codepage = 850;
      else if (eq(lexem.KeywordName(), "ansicpg") && val)
        globalState.codepage = static_cast<unsigned int>(*val);
      else if (eq(lexem.KeywordName(), "deff") && val)
        globalState.deff = *val;
      else if (eq(lexem.KeywordName(), "fonttbl")) globalState.stack.top().fonttbl = true;
      else if (eq(lexem.KeywordName(), "f") && val) {
        globalState.stack.top().f = *val;
      }
      else if (eq(lexem.KeywordName(), "fcharset") &&
               globalState.stack.top().fonttbl &&
               (globalState.stack.top().f != -1) && val) {
        FontInfo& f = globalState.fonttbl[globalState.stack.top().f];
        f.options |= FontInfo::has_fcharset;
        f.fcharset = *val;
      }
      else if (eq(lexem.KeywordName(), "cpg") && val) {
        if (globalState.stack.top().fonttbl && (globalState.stack.top().f != -1)) { // Defining a font
          FontInfo& f = globalState.fonttbl[globalState.stack.top().f];
          f.options |= FontInfo::has_cpg;
          f.cpg = *val;
        }
        else { // Overriding the codepage for the block - may be in filenames
          globalState.stack.top().codepage = *val;
        }
      }
      else if (eq(lexem.KeywordName(), "plain"))
        globalState.stack.top().f = -1;
      else if (eq(lexem.KeywordName(), "uc") && val)
        globalState.stack.top().uc = *val;
    }
    break;
  case Lexem::ltPCDATA_A:
    if (globalState.pcdata_a_state == GlobalState::pcdsno) // Beginning of the pcdata
      globalState.pcdata_a_codepage = globalState.GetCurrentCP(); // to use later to convert to utf16
    globalState.pcdata_a_state = GlobalState::pcdsin;
    globalState.pcdata_a << lexem.pcdata_a();
    break;
  }
}

void DecodeRTF(std::istream& rtf, CRTFDecoder& decoder)
{
  // Check if this is the rtf
  Lexem lexem = GetLexem(rtf);
  if (lexem.type() != Lexem::ltGroupBegin)
    return;
  decoder.BeginGroup();
  lexem = GetLexem(rtf);
  if ((lexem.type() != Lexem::ltKeyword) || !eq(lexem.KeywordName(), "rtf") ||
      !lexem.KeywordVal() || (*lexem.KeywordVal() != 1))
    return;
  decoder.Keyword(lexem.KeywordName(), lexem.KeywordVal());

  GlobalState state(rtf);
  // Level is the count of elements in the stack

  while (!state.stream.eof() && (state.stack.size()>0)) { // Don't go past the global group
    lexem = GetLexem(state.stream);
    PreprocessLexem(lexem, state.stream, state.stack.top().uc);
    UpdateState(lexem, state);

    if (state.pcdata_a_state == GlobalState::pcdsfinished) {
      std::string s = state.pcdata_a.str();
      int sz = ::MultiByteToWideChar(state.pcdata_a_codepage, 0, s.c_str(), s.size(), 0, 0);
      if (sz) {
        wchar_t* data = new wchar_t[sz];
        ::MultiByteToWideChar(state.pcdata_a_codepage, 0, s.c_str(), s.size(), data, sz);
        decoder.PCDATA(data, sz);
        delete[] data;
      }
      state.pcdata_a.str(""); // reset
    }

    switch (lexem.type()) {
    case Lexem::ltGroupBegin:
      decoder.BeginGroup();
      break;
    case Lexem::ltGroupEnd:
      decoder.EndGroup();
      break;
    case Lexem::ltKeyword:
      decoder.Keyword(lexem.KeywordName(), lexem.KeywordVal());
      break;
    case Lexem::ltPCDATA_W:
      {
        wchar_t ch = lexem.pcdata_w();
        decoder.PCDATA(&ch, 1);
      }
      break;
    case Lexem::ltBDATA:
      decoder.BDATA(lexem.bdata(), lexem.bdata_sz());
      break;
    case Lexem::ltError:
      break; // Just silently skip the erroneous data - basic error recovery
    }
  } // while
} // DecodeRTF