diff options
Diffstat (limited to 'l10ntools/source/wtratree.cxx')
-rw-r--r-- | l10ntools/source/wtratree.cxx | 420 |
1 files changed, 420 insertions, 0 deletions
diff --git a/l10ntools/source/wtratree.cxx b/l10ntools/source/wtratree.cxx new file mode 100644 index 000000000000..7e3bf4fda724 --- /dev/null +++ b/l10ntools/source/wtratree.cxx @@ -0,0 +1,420 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/************************************************************************* + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * Copyright 2000, 2010 Oracle and/or its affiliates. + * + * OpenOffice.org - a multi-platform office productivity suite + * + * This file is part of OpenOffice.org. + * + * OpenOffice.org is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 3 + * only, as published by the Free Software Foundation. + * + * OpenOffice.org is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License version 3 for more details + * (a copy is included in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU Lesser General Public License + * version 3 along with OpenOffice.org. If not, see + * <http://www.openoffice.org/license.html> + * for a copy of the LGPLv3 License. + * + ************************************************************************/ + +// MARKER(update_precomp.py): autogen include statement, do not remove +#include "precompiled_l10ntools.hxx" + + +#include "wtratree.hxx" + + + +/** @ATTENTION + For reasons of speed, class WordTransTree works with two simple + char arrays, sOutput and sInput, instead of secure containers or + streams. So be extremely careful, when changing this code!!! +**/ + + + +// NOT FULLY DECLARED SERVICES +#include <string.h> +#include <stdio.h> +#include <ctype.h> +#include "wtranode.hxx" + + +const BRANCH_T BR_END = 0; +const BRANCH_T BR_NONALPHA = 1; +const BRANCH_T BR_HOTKEY = 2; +const BRANCH_T BR_BACKSLASH = 3; +const BRANCH_T BR_ALPHABASE = 4; /// @ATTENTION All branches not valid for words must be smaller than this value! +const BRANCH_T BR_AE = 30; +const BRANCH_T BR_OE = 31; +const BRANCH_T BR_UE = 32; +const BRANCH_T BR_SZ = 33; +const BRANCH_T BR_MAX = 34; /// @ATTENTION Must be updated always! + +const BRANCH_T BR_START = 0; + + + + + +WordTransTree::WordTransTree(CharSet i_nWorkingCharSet) + : sInput(0), + nInputLength(0), + pInputEnd(0), + sOutput(0), + nOutputMaxLength(0), + dpParsingTreeTop(0), + pUnknownAlpha(0), + // cChar2Branch + c_AE(u_char('\xC4')), c_OE(u_char('\xD6')), c_UE(u_char('\xDC')), + c_ae(u_char('\xE4')), c_oe(u_char('\xF6')), c_ue(u_char('\xFC')), + pInputCurTokenStart(0), + pInputPosition(0), + pOutputPosition(0), + pCurParseNode(0), + eCurResult(OK), + cCurHotkey(0), + cCurHotkeySign(u_char('~')) +{ + // Initialize parsing tree: + pUnknownAlpha = new WTT_Node(BR_ALPHABASE,0,0); // This will be deleted as part of the parsing tree. + for ( UINT8 i = BR_ALPHABASE; i < C_NR_OF_BRANCHES; i++) + { + pUnknownAlpha->SetBranch(i,pUnknownAlpha); + } // end for + + dpParsingTreeTop = new WTT_Node(BR_START,0,pUnknownAlpha); + + WTT_Node * dpNonAlpha = new WTT_Node(BR_NONALPHA,0,0); + + dpNonAlpha->SetBranch(BR_NONALPHA,dpNonAlpha); + dpParsingTreeTop->SetBranch(BR_NONALPHA,dpNonAlpha); + + WTT_Node * dpBackslash = new WTT_Node(BR_BACKSLASH,dpNonAlpha,dpNonAlpha); + dpBackslash->SetBranch(BR_END,0); + + dpParsingTreeTop->SetBranch(BR_BACKSLASH,dpBackslash); + dpNonAlpha->SetBranch(BR_BACKSLASH,dpBackslash); + + + // Initialize character set: + SetCharSet(i_nWorkingCharSet); + + if (C_BR_ALPHABASE != BR_ALPHABASE || C_NR_OF_BRANCHES != BR_MAX) + { + fprintf(stderr, "Assertion failed: file %s line %d.", __FILE__, __LINE__); + exit(1); + } +} + +void +WordTransTree::SetCharSet(CharSet i_nWorkingCharSet) +{ + ByteString sConvert("\xC4\xD6\xDC\xE4\xF6\xFC\xDF"); + const u_char * pConvert = (const u_char * ) ( sConvert.Convert(RTL_TEXTENCODING_MS_1252, i_nWorkingCharSet).GetBuffer() ); + + INT16 i = 0; + for ( ; i < C_NR_OF_POSSIBLE_CHARS; ++i ) + { + cChar2Branch[i] = BR_NONALPHA; + } // end for + for ( i = 'a'; i <= 'z'; ++i ) + { + cChar2Branch[i] = BR_ALPHABASE + i - 'a'; + } // end for + for ( i = 'A'; i <= 'Z'; ++i ) + { + cChar2Branch[i] = BR_ALPHABASE + i - 'A'; + } // end for + cChar2Branch[pConvert[0]] = BR_AE; + cChar2Branch[pConvert[1]] = BR_OE; + cChar2Branch[pConvert[2]] = BR_UE; + cChar2Branch[pConvert[3]] = BR_AE; + cChar2Branch[pConvert[4]] = BR_OE; + cChar2Branch[pConvert[5]] = BR_UE; + cChar2Branch[pConvert[6]] = BR_SZ; + + cChar2Branch[u_char('~')] = BR_HOTKEY; + cChar2Branch[u_char('&')] = BR_HOTKEY; + + + c_AE = pConvert[0]; + c_OE = pConvert[1]; + c_UE = pConvert[2]; + c_ae = pConvert[3]; + c_oe = pConvert[4]; + c_ue = pConvert[5]; +} + +WordTransTree::~WordTransTree() +{ + delete dpParsingTreeTop; + if (sOutput != 0) + delete [] sOutput; +} + +void +WordTransTree::AddWordPair( const ByteString & i_sOldString, + const ByteString & i_sReplaceString ) +{ + if (i_sOldString.Len() == 0) + return; + + pCurParseNode = dpParsingTreeTop; + WTT_Node * pBranch = 0; + char cBranch = 0; + + for ( constr pOld = i_sOldString.GetBuffer(); + *pOld != 0; + pOld++ ) + { + cBranch = CalculateBranch(*pOld); + pBranch = pCurParseNode->GetNextNode(cBranch); + if (pBranch == 0 || pBranch == pUnknownAlpha) + { + pBranch = new WTT_Node(cBranch,0,pUnknownAlpha); + pCurParseNode->SetBranch(cBranch,pBranch); + } + pCurParseNode = pBranch; + } // end for + pCurParseNode->SetAsTokenToReplace(i_sReplaceString); +} + +void +WordTransTree::InitTransformation( const char * i_sInput, + UINT32 i_nInputLength, + UINT32 i_nOutputMaxLength ) +{ + sInput = (const u_char *)i_sInput; + nInputLength = i_nInputLength; + pInputEnd = &sInput[i_nInputLength]; + + pInputCurTokenStart = sInput; + pInputPosition = sInput; + + if (nOutputMaxLength < i_nOutputMaxLength) + { + if (sOutput != 0) + delete [] sOutput; + sOutput = new unsigned char[i_nOutputMaxLength]; + nOutputMaxLength = i_nOutputMaxLength; + } + pOutputPosition = sOutput; +} + +/** pInputCurTokenStart and CurParseNode are updated just when + starting this function. After its end they must not be changed + till this functon is called again. + Outside this function pInputPositon and pOutputPosition are both + on the first not transformed char in their respective array. +**/ +WordTransTree::E_Result +WordTransTree::TransformNextToken() +{ + pInputCurTokenStart = pInputPosition; + pCurParseNode = dpParsingTreeTop; + cCurHotkey = 0; + eCurResult = OK; + + WTT_Node * pBranch = 0; + UINT8 cBranch = 0; + + for ( pCurParseNode = dpParsingTreeTop; + pInputPosition != pInputEnd; + ++pInputPosition ) + { + cBranch = CalculateBranch(*pInputPosition); + pBranch = pCurParseNode->GetNextNode( cBranch ); + if (pBranch != 0) + { + pCurParseNode = pBranch; + } + else + { + if (cBranch == BR_HOTKEY) // current letter is '~' or '&'. + { + // Logic of the following. There are 9 possible cases - + // A = alphabetic letter, NA = non alphabetic, TB = token begin, + // Eot = end of text: + // 1. A~A set hotkey to following letter, continue + // 2. A~NA token end + // 3. A~Eot token end + // 4. NA~A token end + // 5. NA~NA continue + // 6. A~Eof continue + // 7. TB~A set hotkey to following letter, continue + // 8. TB~NA continue + // 9. TB~Eot continue + + // bNext and Prev are true, if there are alphabetic letters: + sal_Bool bNext = pInputPosition + 1 != pInputEnd + ? CalculateBranch(pInputPosition[1]) >= BR_ALPHABASE + : sal_False; + sal_Bool bPrev = pCurParseNode->Value() >= BR_ALPHABASE; + + if ( bNext && (bPrev || pCurParseNode == dpParsingTreeTop) ) + { // case 1. and 7. + Handle_Hotkey(); + continue; + } + else if (!bPrev && !bNext) + { // case 5.,6.,8.,9. + continue; + } + + // Case 2.,3.,4. : + // so this should be handled as an end of a token. + } + if (pCurParseNode->TokenType() == WTT_Node::token_to_keep) + { + Handle_TokenToKeep(); + return eCurResult; + } + else + { + Handle_TokenToTransform(); + return eCurResult; + } // endif (pCurParseNode->TokenType() == WTT_Node::token_to_keep) + } // endif (pBranch == 0) else + } // end for + + // If here, the text end is reached + if (pCurParseNode->TokenType() == WTT_Node::token_to_keep) + { + Handle_TokenToKeep(); + return eCurResult; + } + else + { + Handle_TokenToTransform(); + return eCurResult; + } +} + +ByteString +WordTransTree::CurReplacingString() const +{ + return pCurParseNode->ReplaceString(); +} + +void +WordTransTree::Handle_Hotkey() +{ + if (cCurHotkey == 0) // Avoid to replace the first found hotkey by + // a later one - though this shouldn't happen anyway. + { + cCurHotkey = (pInputPosition+1) != pInputEnd ? pInputPosition[1] : 0; + cCurHotkeySign = *pInputPosition; + } +} + +void +WordTransTree::Handle_TokenToKeep() +{ + UINT32 nTokenLength = pInputPosition-pInputCurTokenStart; + + memcpy(pOutputPosition,pInputCurTokenStart,nTokenLength); + + pOutputPosition += nTokenLength; + *pOutputPosition = '\0'; +} + +void +WordTransTree::Handle_TokenToTransform() +{ + sal_Bool bHaveHotkey = CalculateBranch(cCurHotkey) >= BR_ALPHABASE; + const ByteString & rReplace = pCurParseNode->ReplaceString(); + + // Find position of hotkey in replace-string: + sal_uInt16 nHotkeyPos = bHaveHotkey + ? rReplace.Search(char(cCurHotkey)) + : STRING_NOTFOUND; + if (nHotkeyPos == STRING_NOTFOUND && bHaveHotkey) + { + if (cCurHotkey < 128) + { + if (islower(cCurHotkey)) + nHotkeyPos = rReplace.Search(toupper(char(cCurHotkey))); + else + nHotkeyPos = rReplace.Search(tolower(char(cCurHotkey))); + } + else // cCurHotkey >= 128 + { + if (cCurHotkey == c_ae) + nHotkeyPos = rReplace.Search(char(c_AE)); + else if (cCurHotkey == c_oe) + nHotkeyPos = rReplace.Search(char(c_OE)); + else if (cCurHotkey == c_ue) + nHotkeyPos = rReplace.Search(char(c_UE)); + else if (cCurHotkey == c_AE) + nHotkeyPos = rReplace.Search(char(c_ae)); + else if (cCurHotkey == c_OE) + nHotkeyPos = rReplace.Search(char(c_oe)); + else if (cCurHotkey == c_UE) + nHotkeyPos = rReplace.Search(char(c_ue)); + } // endif (cCurHotkey < 128) else + + if (nHotkeyPos == STRING_NOTFOUND) + { + eCurResult = HOTKEY_LOST; + bHaveHotkey = sal_False; + } + } // endif (nHotkeyPos == STRING_NOT_FOUND && bHaveHotkey) + + + UINT32 nOutputTokenLength = rReplace.Len() + (bHaveHotkey ? 1 : 0); + + if (bHaveHotkey) + { + memcpy( pOutputPosition, + pCurParseNode->ReplaceString().GetBuffer(), + nHotkeyPos ); + *(pOutputPosition + nHotkeyPos) = cCurHotkeySign; + memcpy( pOutputPosition + nHotkeyPos + 1, + pCurParseNode->ReplaceString().GetBuffer() + nHotkeyPos, + nOutputTokenLength - nHotkeyPos - 1); + } + else + { + memcpy( pOutputPosition, + pCurParseNode->ReplaceString().GetBuffer(), + nOutputTokenLength ); + } + + // Convert first letter into upper if necessary: + u_char cInStart = CalculateBranch(*pInputCurTokenStart) == BR_HOTKEY + ? pInputCurTokenStart[1] + : pInputCurTokenStart[0] ; + u_char * pOutStart = nHotkeyPos == 0 + ? pOutputPosition + 1 + : pOutputPosition ; + if (isupper(cInStart) || cInStart > 127) + { // Possibly cInStart is upper character: + if (isupper(cInStart) || cInStart == c_AE || cInStart == c_OE || cInStart == c_UE) + { // Surely cInStart is upper character: + u_char cOutStart = *pOutStart; + if (cOutStart < 128) + *pOutStart = toupper(cOutStart); + else if (cOutStart == c_ae) + *pOutStart = c_AE; + else if (cOutStart == c_oe) + *pOutStart = c_OE; + else if (cOutStart == c_ue) + *pOutStart = c_UE; + } + } // endif (isupper(cInStart) || cInStart > 127) + + pOutputPosition += nOutputTokenLength; + *pOutputPosition = '\0'; +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |