/******************************************************************************
*
* tei2mod.cpp - Utility to import documents encoded as TEI
*
* $Id$
*
* Copyright 2008-2013 CrossWire Bible Society (http://www.crosswire.org)
* CrossWire Bible Society
* P. O. Box 2528
* Tempe, AZ 85280-2528
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation version 2.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
*/
/******************************************************************************
* This program handles xml files of the form:
*
*
*
* <entry key="xxx">...</entry>
* <entryFree key="xxx">...</entryFree>
* <superentry key="xxx">...</superentry>
*
*
*
* The document is assumed to be well-formed and valid.
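* Three kinds of entries are allowed,
* <entry> - a very restricted form of a dictionary entry.
* <entryFree> - a very unrestricted form of a dictionary entry.
* <superentry> - an entry which can have other entries.
* The key for the entry in the module is taken from the entry's n attribute,
* falling back to its sortKey attribute and then to its key attribute.
* A key value may hold several keys separated by '|': the text is stored under
* the first one and each of the others is registered through linkToEntry().
* Note, for a <superentry> only its key becomes a SWORD key.
* Keys of entries internal to it are not used.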
*
* The entries must be sorted according to an ASCII collation of their bytes.
* This should be the same for Latin-1 and for UTF-8
*
* Sword will allow for any tags, but only a few have any styling.
*
* author DM Smith
*/
#ifdef _MSC_VER
#pragma warning( disable: 4251 )
#endif
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#ifdef _ICU_
#include
#include
#endif
#ifndef NO_SWORD_NAMESPACE
using namespace sword;
#endif
using namespace std;
#ifdef _ICU_
// Filter that normalizeInput() applies to text detectUTF8() reports as UTF-8.
// Starts out null and is dereferenced there without a check, so it has to be
// assigned before the first entry is written (the assignment is not in this
// part of the file).
UTF8NFC *normalizer = 0;
// Number of texts that came back from normalizer different from how they went in.
int normalized = 0;
// Filter that normalizeInput() applies to text that is not valid UTF-8;
// such text is assumed to be Latin-1 (cp1252).
Latin1UTF8 converter;
// Number of texts that were run through converter.
int converted = 0;
#endif
// Turns on the trace output written to cout by writeEntry(), linkToEntry()
// and handleToken().
#define DEBUG
// The module that writeEntry() and linkToEntry() write to. Starts out null and
// is dereferenced without a check.
SWLD *module = NULL;
// Key object that handleToken() assigns each entry's key to before writing.
// Starts out null and is dereferenced without a check.
SWKey *currentKey = NULL;
// When false, normalizeInput() leaves text untouched. Only has an effect in
// builds that define _ICU_.
bool normalize = true;
// Raw key attribute of the entry currently being read by handleToken();
// may hold several keys separated by '|'.
SWBuf keyStr;
// Incremented by handleToken() once per entry written and once more per
// additional '|'-separated key; shown in the DEBUG trace.
unsigned long entryCount = 0;
/**
 * Classify a NUL-terminated byte string as UTF-8, plain 7-bit ASCII, or neither.
 *
 * Layout of a UTF-8 encoded character:
 *   Unicode Range            1st       2nd       3rd       4th
 *   U-00000000 - U-0000007F  0nnnnnnn
 *   U-00000080 - U-000007FF  110nnnnn  10nnnnnn
 *   U-00000800 - U-0000FFFF  1110nnnn  10nnnnnn  10nnnnnn
 *   U-00010000 - U-001FFFFF  11110nnn  10nnnnnn  10nnnnnn  10nnnnnn
 *
 * The number of leading 1 bits in the first byte is the total length of the
 * sequence; the "n" bits carry the code point. Sequences longer than 4 bytes
 * (allowed by earlier UTF-8 definitions) are rejected.
 *
 * Only the byte patterns are checked. The decoded code point is not examined,
 * so overlong forms and out-of-range values are accepted.
 *
 * param txt the text to check; must not be null
 * return  1 if every byte with the high bit set is part of a well-formed
 *           multi-byte sequence (and there is at least one such sequence)
 *        -1 if no byte has the high bit set (7-bit ASCII, which is also
 *           valid UTF-8)
 *         0 if some byte with the high bit set is not part of a well-formed
 *           sequence
 * author DM Smith
 */
int detectUTF8(const char *txt) {
	// Work on unsigned bytes so that masking and shifting behave predictably
	const unsigned char *cursor = (const unsigned char *) txt;
	unsigned int multiByteChars = 0;

	for (; *cursor; ++cursor) {
		// Bytes below 0x80 are plain ASCII and need no further checks
		if (!(*cursor & 0x80)) {
			continue;
		}

		// The run of leading 1 bits gives the length of the whole sequence
		int seqLen = 0;
		for (unsigned char bits = *cursor; bits & 0x80; bits <<= 1) {
			++seqLen;
		}

		// 1 leading bit  : 10nnnnnn, a continuation byte with no lead byte
		// 5-8 leading bits: 111110nn .. 11111111, never a legal lead byte
		if (seqLen < 2 || seqLen > 4) {
			return 0;
		}

		// The lead byte must be followed by (seqLen - 1) bytes of 10nnnnnn
		for (int remaining = seqLen - 1; remaining > 0; --remaining) {
			++cursor;
			// Ran into the terminator in the middle of a sequence
			if (!*cursor) {
				return 0;
			}
			// Top two bits of a continuation byte must be 10
			if ((*cursor & 0xc0) != 0x80) {
				return 0;
			}
		}

		// cursor now rests on the last byte of a well-formed character;
		// the loop increment moves past it
		++multiByteChars;
	}

	// No malformed sequence was found: either real UTF-8 or pure ASCII
	return multiByteChars ? 1 : -1;
}
/**
 * Bring an entry's text into normalized UTF-8, in place.
 *
 * Does nothing unless the program was built with _ICU_ and the global
 * normalize flag is set. Otherwise:
 *  - text that detectUTF8() rejects is assumed to be Latin-1 (cp1252) and is
 *    run through converter, with a warning on cout;
 *  - if the text is still rejected after that, an error is printed and the
 *    text is left as the converter produced it;
 *  - text containing multi-byte UTF-8 is run through normalizer;
 *  - pure ASCII text is left alone.
 * The globals converted and normalized count how often each step applied.
 *
 * param key  key of the entry, used only in the messages
 * param text the entry text; may be rewritten
 */
void normalizeInput(SWKey &key, SWBuf &text) {
#ifdef _ICU_
	int encoding = detectUTF8(text.c_str());
	if (!normalize) {
		return;
	}

	// The filters want a key argument; 2 is a stand-in that mimics a real key.
	// TODO: remove all hacks
	SWKey *const fakeKey = (SWKey *)2;

	if (encoding == 0) {
		// Not valid UTF-8, so treat it as Latin-1 and convert
		cout << "Warning: " << key << ": Converting to UTF-8 (" << text << ")" << endl;
		converter.processText(text, fakeKey);
		converted++;
		// Look again: the result decides whether normalization is needed.
		// This re-check can probably be removed.
		encoding = detectUTF8(text.c_str());
	}

	if (encoding == 0) {
		// Conversion did not produce valid UTF-8; nothing more can be done
		cout << "Error: " << key << ": Converting to UTF-8 (" << text << ")" << endl;
		return;
	}

	if (encoding < 0) {
		// Pure ASCII is already normalized
		return;
	}

	SWBuf before = text;
	normalizer->processText(text, fakeKey);
	if (before != text) {
		normalized++;
	}
#endif
}
/**
* Store one entry in the module being built.
*
* param key  key to store the entry under
* param text the entry text; handed to normalizeInput() first, which may
*            rewrite it in place
*/
void writeEntry(SWKey &key, SWBuf &text) {
#ifdef DEBUG
cout << "(" << entryCount << ") " << key << endl;
#endif
// Point the module at the key before storing anything
module->setKey(key);
// Only alters text in builds with _ICU_ and when normalize is set
normalizeInput(key, text);
module->setEntry(text);
}
/**
* Register an additional key for an entry through the module's linkEntry().
*
* param keyBuf  key of the entry that was written; not read by this function
* param linkBuf the additional key; a temporary SWKey built from it is passed
*               to module->linkEntry()
*
* NOTE(review): because keyBuf is ignored, the call depends on whatever key
* the module currently holds (writeEntry() sets it). Confirm against the
* linkEntry() documentation which side of the link the argument names.
*/
void linkToEntry(const SWBuf &keyBuf, const SWBuf &linkBuf) {
SWKey tmpkey = linkBuf.c_str();
module->linkEntry(&tmpkey);
#ifdef DEBUG
// Trace is written after the link has been made
cout << "(" << entryCount << ") " << "Linking: " << linkBuf << endl;
#endif
}
// Return true if the content was handled or is to be ignored.
// false if the what has been seen is to be accumulated and considered later.
bool handleToken(SWBuf &text, XMLTag *token) {
// The start token for the current entry;
static XMLTag startTag;
// Flags to indicate whether we are in a entry, entryFree or superentry
static bool inEntry = false;
static bool inEntryFree = false;
static bool inSuperEntry = false;
const char *tokenName = token->getName();
static const char *splitPtr, *splitPtr2 = NULL;
static char *splitBuffer = new char[4096];
static SWKey tmpKey;
//-- START TAG -------------------------------------------------------------------------
if (!token->isEndTag()) {
// If we are not in an "entry" and we see one, then enter it.
if (!inEntry && !inEntryFree && !inSuperEntry) {
inEntry = !strcmp(tokenName, "entry");
inEntryFree = !strcmp(tokenName, "entryFree");
inSuperEntry = !strcmp(tokenName, "superentry");
if (inEntry || inEntryFree || inSuperEntry) {
#ifdef DEBUG
cout << "Entering " << tokenName << endl;
#endif
startTag = *token;
text = "";
keyStr = token->getAttribute("n"); // P5 with linking and/or non-URI chars
if (!strlen(keyStr)) {
keyStr = token->getAttribute("sortKey"); // P5 otherwise
if (!strlen(keyStr)) {
keyStr = token->getAttribute("key"); // P4
}
}
return false; // make tag be part of the output
}
}
}
//-- EMPTY and END TAG ---------------------------------------------------------------------------------------------
else {
// ENTRY end
// If we see the end of an entry that we are in, then leave it
if ((inEntry && !strcmp(tokenName, "entry" )) ||
(inEntryFree && !strcmp(tokenName, "entryFree" )) ||
(inSuperEntry && !strcmp(tokenName, "superentry"))) {
#ifdef DEBUG
cout << "Leaving " << tokenName << endl;
#endif
// Only one is false coming into here,
// but all must be on leaving.
inEntry = false;
inEntryFree = false;
inSuperEntry = false;
text += token->toString();
entryCount++;
#ifdef DEBUG
cout << "keyStr: " << keyStr << endl;
#endif
splitPtr = strstr(keyStr, "|");
if (splitPtr) {
strncpy (splitBuffer, keyStr.c_str(), splitPtr - keyStr.c_str());
splitBuffer[splitPtr - keyStr.c_str()] = 0;
*currentKey = splitBuffer;
#ifdef DEBUG
cout << "splitBuffer: " << splitBuffer << endl;
cout << "currentKey: " << *currentKey << endl;
#endif
writeEntry(*currentKey, text);
#if 1
while (splitPtr) {
splitPtr += 1;
splitPtr2 = strstr(splitPtr, "|");
entryCount++;
if (splitPtr2) {
strncpy (splitBuffer, splitPtr, splitPtr2 - splitPtr);
splitBuffer[splitPtr2 - splitPtr] = 0;
#ifdef DEBUG
cout << "splitBuffer: " << splitBuffer << endl;
cout << "currentKey: " << *currentKey << endl;
#endif
linkToEntry(currentKey->getText(), splitBuffer);
splitPtr = splitPtr2;
}
else {
strcpy (splitBuffer, splitPtr);
#ifdef DEBUG
cout << "splitBuffer: " << splitBuffer << endl;
cout << "currentKey: " << *currentKey << endl;
#endif
linkToEntry(currentKey->getText(), splitBuffer);
splitPtr = 0;
}
}
#endif
}
else {
*currentKey = keyStr;
writeEntry(*currentKey, text);
}
// Since we consumed the text, clear it
// and tell the caller that the tag was consumed.
text = "";
return true;
}
}
return false;
}
void usage(const char *app, const char *error = 0) {
if (error) fprintf(stderr, "\n%s: %s\n", app, error);
fprintf(stderr, "TEI Lexicon/Dictionary/Daily Devotional/Glossary module creation tool for\n\tThe SWORD Project\n");
fprintf(stderr, "\nusage: %s