/*----------------------------------------------------------------------------*\ * Copyright (c) 1999-2003 CubeWerx Inc. Licensed under the GNU LGPL. * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 2.1 of the License, * or any later version. This library is distributed in the hope that * it will be useful, but WITHOUT ANY WARRANTY, without even the implied * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * See the GNU Lesser General Public License for more details, either * in the "LICENSE.LGPL.txt" file distributed with this software or at * web page "http://www.fsf.org/licenses/lgpl.html". * * MODULE: cw_xmlscan.h * PURPOSE: XML scanner * HISTORY: * DATE PROGRAMMER DESCRIPTION * 21-Aug-1999 Craig Bruce Initial creation * 23-Aug-1999 Craig Bruce Reworked entity API * 02-Nov-2000 Craig Bruce Significant API upgrades * 01-May-2003 Craig Bruce Rewrote for BXML support * 20-Jun-2003 Craig Bruce Reorganized into "cwxml" module *---------------------------------------------------------------------------- * $Id: cw_xmlscan.h,v 1.9 2004/02/27 21:14:58 csbruce Exp $ \*----------------------------------------------------------------------------*/ #include #include #ifndef _CW_XMLSCAN_H #define _CW_XMLSCAN_H #ifdef __cplusplus extern "C" { #endif /*============================================================================*\ * SCANNER STRUCTURES \*============================================================================*/ /*----------------------------------------------------------------------------*\ * CW_XML_SCAN_STATE - current state of scanner \*----------------------------------------------------------------------------*/ typedef enum { CW_XML_SCAN_STATE_IN_CONTENT, /* inside of general content */ CW_XML_SCAN_STATE_AT_TAG_START, /* at start of '<' tag */ CW_XML_SCAN_STATE_IN_TAG, /* inside of element-open tag, w/attrs */ CW_XML_SCAN_STATE_AT_ENTITY_START, /* at start of '&' entity */ CW_XML_SCAN_STATE_AT_BXML_TAG_END /* synthesize no-attr tag finish: bxml*/ } CW_XML_SCAN_STATE; /*----------------------------------------------------------------------------*\ * CW_XML_SCAN - context for scanning - do not access fields directly! \*----------------------------------------------------------------------------*/ #define CW_XML_SCAN_BUFSIZE 1024 /* small enough for L-1 cache */ typedef struct CW_XML_SCAN CW_XML_SCAN; /* forward definition */ struct CW_XML_SCAN { /** file handling **/ FILE *file; /* file to read from */ bool closeFile; /* flag to close file when finished */ bool localGZipIsActive; /* flag if gzip decoding is activated locally*/ byte buffer[CW_XML_SCAN_BUFSIZE]; /* buffer for input data */ byte *bufPtr; /* scan position within input buffer */ byte *bufLimit; /* limit position of remaining data in buffer */ long scanLineNum; /* text line of current character */ long scanLineByte; /* consumed length of text line */ long nodeLineNum; /* line of new node */ long nodeLineByte; /* line position of new node */ /** processing settings **/ bool trimWhitespace; /* flag to trim unnecessary content whitespace */ bool trimAttrSpace; /* flag to trim whitespace from attributes */ bool isBxml; /* am scanning Binary XML */ long bxmlVersion; /* version of BXML stream */ bool mustSwapEndian; /* must swap endian of bxml words */ char *charEncoding; /* character-encoding id string */ bool decodeUtf8; /* flag to decode UTF-8 text */ /** token handling **/ CW_XML_DOC *document; /* xml document */ CW_XML_NODE *nodeRef; /* ref to master scanning node in doc tree */ bool nodeIsUngotten; /* the node has been un-gotten */ bool trimContLeadWhite; /* trim leading whitespace from next content */ void *nodePool; /* pool of nodes available for re-use */ bool atDocStart; /* flag for haven't read any nodes yet */ const CW_XML_NODE *(*tokenReader)( CW_XML_SCAN *xmlScan, CW_XML_NODE *node ); /* tokenizer method */ CW_XML_SCAN_STATE scanState;/* current scan state */ bool inAttribute; /* flag for inside of attribute */ char attrCloseChar; /* attribute-close character */ bool synthesizeClose; /* flag to synthesize close of empty element */ uint32 openTagFlags; /* empty/attr flags from last opening tag */ bool haveFetchedSubtree; /* have already fetched node subtree */ CW_STR_OBJ*scanString; /* reusable scanning string */ long xmlLevel; /* current xml level */ CW_XML_STR_TAB_ENT **xmlLevelTags; /* ext-str-table entries for tag levels*/ long xmlLevelAlloc; /* depth of xml-levels allocated */ } /*CW_XML_SCAN*/; /*============================================================================*\ * SCANNER INITIALIZATION & CONTROL FUNCTIONS \*============================================================================*/ /*----------------------------------------------------------------------------*\ * NAME: * CwXmlScan_IsXmlDocument() - determine if MIME type is an XML document * DESCRIPTION: * Determines whether the given MIME type is for an XML-based format. * ARGUMENTS: * mimeType - given MIME type * RETURNS: * isXml - flag indicating if MIME type is for XML document * ERRORS: * (no errors are possible) \*----------------------------------------------------------------------------*/ CWEXP bool CwXmlScan_IsXmlDocument( const char *mimeType ); /*----------------------------------------------------------------------------*\ * NAME: * CwXmlScan_Init() - start XML scanner on a file stream * DESCRIPTION: * Starts the XML scanner on a file stream. The stream will be * checked for GZIP and/or BXML data. * ARGUMENTS: * file - file stream to read from * RETURNS: * xmlScan - xml-scan object, or NULL on error * ERRORS: * - malloc errors are possible if they are enabled * - file/gzip/bxml errors \*----------------------------------------------------------------------------*/ CWEXP CW_XML_SCAN *CwXmlScan_Init( FILE *file ); /*----------------------------------------------------------------------------*\ * NAME: * CwXmlScan_InitString() - start XML scanner on a string * DESCRIPTION: * Starts the XML scanner on a memory buffer/string. The stream * will be checked for GZIP and/or BXML data. The buffer/string must * persist for the duration of the XML-scanning activity. * ARGUMENTS: * xmlString - the string containing the XML data * xmlStringLen - length of the data string, or -1=auto * RETURNS: * xmlScan - xml-scan object, or NULL on error * ERRORS: * - malloc errors are possible if they are enabled * - pseudo-file errors * - gzip/bxml errors \*----------------------------------------------------------------------------*/ CWEXP CW_XML_SCAN *CwXmlScan_InitString( const char *xmlString, long xmlStringLen ); /*----------------------------------------------------------------------------*\ * NAME: * CwXmlScan_Finish() - finish XML scanner on a file stream * DESCRIPTION: * Finishes the XML scanner on a file stream. Local GZIP decompression * will be disabled. The scanner resources will be released even if * an error is returned. The document attached to the scanner will * be destroyed. * ARGUMENTS: * xmlScan - xml-scanner object * RETURNS: * err - 0 on success or -1 on error * ERRORS: * - malloc errors are possible if they are enabled * - file/gzip errors \*----------------------------------------------------------------------------*/ CWEXP int CwXmlScan_Finish( CW_XML_SCAN *xmlScan ); /*----------------------------------------------------------------------------*\ * NAME: * CwXmlScan_FinishAndDetachDocument() - finish scanning, return document * DESCRIPTION: * Finishes the XML scanner on a file stream. Local GZIP decompression * will be disabled. The scanner resources will be released even * if an error is returned. The document attached to the scanner * will be detached and returned to you, unless there is an error, * in which case it will be destroyed. You will ultimately need to * destroy the document using CwXmlDoc_Destroy(). * ARGUMENTS: * xmlScan - xml-scanner object * RETURNS: * document - document object or NULL on error * ERRORS: * - malloc errors are possible if they are enabled * - file/gzip errors \*----------------------------------------------------------------------------*/ CWEXP CW_XML_DOC *CwXmlScan_FinishAndDetachDocument( CW_XML_SCAN *xmlScan ); /*----------------------------------------------------------------------------*\ * NAME: * CwXmlScan_SetWhitespaceTrimming() - set content-whitespace trimming * DESCRIPTION: * Sets the mode for trimming leading and trailing whitespace from * regular XML content. If the passed 'flag' is TRUE, whitespace * will be trimmed from the normal XML-input content, FALSE, not. * The trimming of whitespace in attribute values is controlled * independently. * ARGUMENTS: * xmlScan - xml-scanner object * flag - flag value to set trimming mode to * RETURNS: * prevFlag - previous trimming-mode flag * ERRORS: * - file/gzip/xml/bxml errors \*----------------------------------------------------------------------------*/ CWEXP bool CwXmlScan_SetWhitespaceTrimming( CW_XML_SCAN *xmlScan, bool flag ); /*----------------------------------------------------------------------------*\ * NAME: * CwXmlScan_SetAttributeWhitespaceTrimming() - set attr-whitespace trimming * DESCRIPTION: * Sets the mode for trimming leading and trailing whitespace from * XML attribute content. If the passed 'flag' is TRUE, whitespace * will be trimmed from attribute content, FALSE, not. * ARGUMENTS: * xmlScan - xml-scanner object * flag - flag value to set trimming mode to * RETURNS: * prevFlag - previous trimming-mode flag * ERRORS: * - file/gzip/xml/bxml errors \*----------------------------------------------------------------------------*/ CWEXP bool CwXmlScan_SetAttributeWhitespaceTrimming( CW_XML_SCAN *xmlScan, bool flag ); /*============================================================================*\ * WHOLE-DOCUMENT-SCANNING (DOM) INTERFACE \*============================================================================*/ /*----------------------------------------------------------------------------*\ * NAME: * CwXmlScan_ReadWholeDocument() - read whole XML document into memory tree * DESCRIPTION: * Reads an entire XML document into a tree of nodes in memory. * A pointer to the root node of the tree is returned and the tree may * then be accessed using the "cw_xmltree.h" interface. Abstractly, * this is the DOM method of XML processing. * * Be careful about trying to read/process extremely large XML * documents in this way because you may run out of memory. This call * will fail if the document has already been accessed using the * node/subtree-scanning interface. * ARGUMENTS: * xmlScan - xml-scanner object * RETURNS: * node - XML node with complete subtree, or NULL on error * ERRORS: * - malloc errors are possible if they are enabled * - improper scanning state * - file/gzip errors * - xml/bxml errors \*----------------------------------------------------------------------------*/ CWEXP const CW_XML_NODE *CwXmlScan_ReadWholeDocument( CW_XML_SCAN *xmlScan ); /*----------------------------------------------------------------------------*\ * NAME: * CwXmlScan_GetDocument() - locate document object being scanned * DESCRIPTION: * Returns a reference to the document the scanner is scanning into. * ARGUMENTS: * xmlScan - scanner object * RETURNS: * doc - pointer to document object * ERRORS: * (no errors are possible) \*----------------------------------------------------------------------------*/ #define CwXmlScan_GetDocument(xmlScan) ((const CW_XML_DOC *)(xmlScan)->document) /*----------------------------------------------------------------------------*\ * NAME: * CwXmlScan_LoadDocument() - load a document into memory given filename * DESCRIPTION: * This is a convenience function that loads a complete XML document * into memory given its filename. The returned document will be * independent of any scanning environment. Default scanner settings * will be used. You will ultimately need to destroy the document * using CwXmlDoc_Destroy(). Use CwXmlDoc_GetRootNode() to locate * the root node of the document. * ARGUMENTS: * filename - name of file to load * RETURNS: * document - document object or NULL on error * ERRORS: * - malloc errors are possible if they are enabled * - XML-scanning errors * - file/gzip errors \*----------------------------------------------------------------------------*/ CWEXP CW_XML_DOC *CwXmlScan_LoadDocument( const char *filename ); /*============================================================================*\ * NODE/SUBTREE-SCANNING INTERFACE \*============================================================================*/ /*----------------------------------------------------------------------------*\ * NAME: * CwXmlScan_ReadNode() - read XML node from input stream * DESCRIPTION: * Reads the next XML node/token from the scanning stream. For most * node/token types, no subtree is read, but for the elements, the * subtree of only its attributes is read. You can access an element's * attributes using the "cw_xmltree.h" interface. To read the rest of * the subtree of an element, you must call CwXmlScan_ReadSubTree(), * but be careful of arbitrarily large subtrees. * ARGUMENTS: * xmlScan - xml-scanner object * RETURNS: * node - new XML node, or NULL on error * ERRORS: * - file/gzip/xml/bxml errors \*----------------------------------------------------------------------------*/ CWEXP const CW_XML_NODE *CwXmlScan_ReadNode( CW_XML_SCAN *xmlScan ); /*----------------------------------------------------------------------------*\ * NAME: * CwXmlScan_ReadSubTree() - read subtree of present XML node * DESCRIPTION: * Reads the subtree of the present XML node that was most recently * read by CwXmlScan_ReadNode(). If the present node is not an * element-opening tag, then this function does nothing. If no nodes * have been read from the input stream yet, this function returns * an error. * ARGUMENTS: * xmlScan - xml-scanner object * RETURNS: * node - XML node with complete subtree, or NULL on error * ERRORS: * - malloc errors are possible if they are enabled * - file/gzip errors * - xml/bxml errors \*----------------------------------------------------------------------------*/ CWEXP const CW_XML_NODE *CwXmlScan_ReadSubTree( CW_XML_SCAN *xmlScan ); /*----------------------------------------------------------------------------*\ * NAME: * CwXmlScan_GetCurrentNode - locate the current node * DESCRIPTION: * Returns a pointer to the node object that was last scanned. * ARGUMENTS: * xmlScan - xml-scanner object * RETURNS: * node - XML node * ERRORS: * (no errors are possible) \*----------------------------------------------------------------------------*/ #define CwXmlScan_GetCurrentNode(xmlScan) ((xmlScan)->nodeRef) /*----------------------------------------------------------------------------*\ * NAME: * CwXmlScan_UngetNode() - un-read the last XML node to be read again * DESCRIPTION: * 'Ungets' the presently scanned XML node so that it will be the * next node returned by CwXmlScan_ReadNode(). Only one node can be * ungotten at a time. * ARGUMENTS: * xmlScan - xml-scanner object * RETURNS: * (nothing) * ERRORS: * (no errors are possible) \*----------------------------------------------------------------------------*/ #define CwXmlScan_UngetNode(xmlScan) ((xmlScan)->nodeIsUngotten = TRUE) /*----------------------------------------------------------------------------*\ * NAME: * CwXmlScan_SkipToElementFull() - skip ahead to indicated element * DESCRIPTION: * Skips ahead until one of the identified elements is encountered * at the present XML nesting level. (Future optimizations may * make this call much more efficient than sequential reading.) * Multiple elements may be given in the 'elementName' by separating * each name with a '|' character (e.g., "ogc:Title|ogc:Abstract"). * An element-name value of NULL will match any element. If the * 'isRequired' flag is TRUE, then one of the identified elements * must be encountered within the current nesting level or an * error is raised. If the 'nextElementMustMatch' flag is TRUE, * then the first element encountered must match the given name(s). * The name-position starting from 1 of the match is returned, or 0 if * no match is found, or -1 on error. If a match 'isRequired', then * a return of 0 is impossible. If the element is not found, then the * input stream will be repositioned to the closing tag of the starting * XML-nesting level. You can locate the matching element node for * future operations using the CwXmlScan_GetCurrentNode() call. * BUGS: * Presently, only tag basenames are compared, so "ogc:Title" will * match "bob:Title". * ARGUMENTS: * xmlScan - xml-scanner object * elementName - element name or names, may be NULL * isRequired - a match is required to be found, or error return * nextElementMustMatch - the next element encountered must match * RETURNS: * matchPos = name-match position starting from 1, 0 if no match found, * or -1 on error * ERRORS: * - file/gzip/xml/bxml errors \*----------------------------------------------------------------------------*/ CWEXP long CwXmlScan_SkipToElementFull( CW_XML_SCAN *xmlScan, const char *elementName, bool isRequired, bool nextElementMustMatch ); /*----------------------------------------------------------------------------*\ * NAME: * CwXmlScan_SkipToElement() - skip ahead to indicated element * DESCRIPTION: * Skips ahead in the input stream to the identified element, which * need not actually be present. See CwXmlScan_SkipToElementFull() * for description. * ARGUMENTS: * xmlScan - xml-scanner object * elementName - element name, may be NULL * RETURNS: * matchPos = name-match position starting from 1, 0 if no match found, * or -1 on error * ERRORS: * - file/gzip/xml/bxml errors \*----------------------------------------------------------------------------*/ #define CwXmlScan_SkipToElement(xmlScan,elementName) \ CwXmlScan_SkipToElementFull((xmlScan), (elementName), FALSE, FALSE) /*----------------------------------------------------------------------------*\ * NAME: * CwXmlScan_SkipToRequiredElement() - skip ahead to indicated element * DESCRIPTION: * Skips ahead in the input stream to the identified element, * which must be present. See CwXmlScan_SkipToElementFull() for * description. The element does not need to be the next element * present in the stream. * ARGUMENTS: * xmlScan - xml-scanner object * elementName - element name, may be NULL * RETURNS: * node - indicated XML-element node, or NULL on error * ERRORS: * - file/gzip/xml/bxml errors \*----------------------------------------------------------------------------*/ CWEXP const CW_XML_NODE *CwXmlScan_SkipToRequiredElement( CW_XML_SCAN *xmlScan, const char *elementName ); /*----------------------------------------------------------------------------*\ * NAME: * CwXmlScan_SkipToRequiredNextElement() - skip ahead to indicated element * DESCRIPTION: * Skips ahead in the input stream to the identified element, * which must be present and must be the next element to appear. * See CwXmlScan_SkipToElementFull() for description. * ARGUMENTS: * xmlScan - xml-scanner object * elementName - element name, may be NULL * RETURNS: * node - indicated XML-element node, or NULL on error * ERRORS: * - file/gzip/xml/bxml errors \*----------------------------------------------------------------------------*/ CWEXP const CW_XML_NODE *CwXmlScan_SkipToRequiredNextElement( CW_XML_SCAN *xmlScan, const char *elementName ); /*----------------------------------------------------------------------------*\ * NAME: * CwXmlScan_SkipToCloseTag() - skip ahead to indicated element-close tag * DESCRIPTION: * Skips ahead in the input stream until the identified element-closing * tag is encountered that is above the present XML-nesting level. * A NULL element name will match any close tag, effectively meaning * to skip past the present-level subtree. Scanning will be stopped * at the identified closer node or an error will be returned. * ARGUMENTS: * xmlScan - xml-scanner object * elementName - element name, NULL = any * RETURNS: * err - 0 on success or -1 on error * ERRORS: * - file/gzip/xml/bxml errors \*----------------------------------------------------------------------------*/ CWEXP int CwXmlScan_SkipToCloseTag( CW_XML_SCAN *xmlScan, const char *elementName ); /*----------------------------------------------------------------------------*\ * NAME: * CwXmlScan_ReadTextContent() - read text content of an element * DESCRIPTION: * Reads the content of the current element. (The scanner must be * positioned to have just scanned an element-opening tag.) The text * is returned. The text will include all of the content of the * element subtree as defined in the CwXmlNode_GetText() function. * No Double, Long, or Bool versions of this function are provided, * but you can call CwXmlScan_ReadSubTree() instead and extract the * content using the CwXmlNode_...() access methods. * ARGUMENTS: * xmlScan - xml-scanner object, positioned on element-opening tag * outText - (out) text content read in, NULL if there is no content * RETURNS: * err - 0 on success or -1 on error * ERRORS: * - file/gzip/xml/bxml errors \*----------------------------------------------------------------------------*/ CWEXP int CwXmlScan_ReadTextContent( CW_XML_SCAN *xmlScan, const char **outText ); /*============================================================================*\ * RAW TOKENIZER - normal users shouldn't use this \*============================================================================*/ /*----------------------------------------------------------------------------*\ * NAME: * CwXmlScan_ReadRawToken() - read next raw XML token * DESCRIPTION: * Reads the next raw XML token from the input stream. Raw tokens * are lower-level than "nodes" in that attributes of elements are * not read together and other niceties and error checking is not * performed. You must not mix calling this function with using * the higher-level mechanism, not that this should ever be needed, * because this function is highly optimized and some things won't * work properly. This function should only be used for brutal * efficiency when combining the tokenizer of the cwxml scanner with * another XML-parsing library. * * The contents of some tokens are a little different from the * equivalent nodes, which are described with the CW_XML_NODE * structure. XXX CW_XML_NODE_ELEMENT. CW_XML_NODE_ELEMENT_FINISH * present. CW_XML_NODE_CLOSE. CW_XML_NODE_TEXT. * * ARGUMENTS: * xmlScan - xml-scanner object * RETURNS: * token - token read, or NULL on error * ERRORS: * - malloc errors are possible if they are enabled * - file/gzip/xml/bxml errors \*----------------------------------------------------------------------------*/ #define CwXmlScan_ReadRawToken(xmlScan) \ ( (xmlScan)->tokenReader((xmlScan), (xmlScan)->nodeRef) ) #ifdef __cplusplus }; #endif #endif /*----------------------------------------------------------------------------*\ * END OF MODULE: cw_xmlscan.h \*----------------------------------------------------------------------------*/