Marc Ermshaus’ avatar

Marc Ermshaus

Linkblog

Algorithmic Advent: 17 – Displaying the structure of an XML document

Published on 17 Dec 2010. Tagged with php, algorithmicadvent.

<?php

/**
 * Translate a DOM node type code into the name of its XML_*_NODE constant.
 *
 * Covers every node type constant defined by PHP's DOM extension (1-18),
 * including the libxml-specific ones above 12 (HTML document, DTD and
 * declaration nodes), so these no longer show up as 'UNKNOWN'.
 *
 * @param int $nodeType Node type code, e.g. the value of DOMNode::$nodeType.
 *
 * @return string The constant name, or 'UNKNOWN' if the code is not mapped.
 */
function nodeTypeToString($nodeType)
{
    $map = array(
         1 => 'XML_ELEMENT_NODE',
         2 => 'XML_ATTRIBUTE_NODE',
         3 => 'XML_TEXT_NODE',
         4 => 'XML_CDATA_SECTION_NODE',
         5 => 'XML_ENTITY_REFERENCE_NODE',
         6 => 'XML_ENTITY_NODE',
         7 => 'XML_PROCESSING_INSTRUCTION_NODE',
         8 => 'XML_COMMENT_NODE',
         9 => 'XML_DOCUMENT_NODE',
        10 => 'XML_DOCUMENT_TYPE_NODE',
        11 => 'XML_DOCUMENT_FRAGMENT_NODE',
        12 => 'XML_NOTATION_NODE',
        13 => 'XML_HTML_DOCUMENT_NODE',
        14 => 'XML_DTD_NODE',
        15 => 'XML_ELEMENT_DECL_NODE',
        16 => 'XML_ATTRIBUTE_DECL_NODE',
        17 => 'XML_ENTITY_DECL_NODE',
        18 => 'XML_NAMESPACE_DECL_NODE',
    );

    if (isset($map[$nodeType])) {
        return $map[$nodeType];
    }

    return 'UNKNOWN';
}

// Declare the response as UTF-8 HTML; this is sent before the table is
// echoed further down in the script.
header('Content-Type: text/html; charset=UTF-8');

$doc = new DOMDocument();
// Turn off whitespace preservation before parsing. Per the PHP manual this
// lets the parser remove redundant whitespace; presumably done here to keep
// whitespace-only text nodes out of the table.
$doc->preserveWhiteSpace = false;

// Sample input: a document with an internal DTD subset (one element
// declaration, two parameter entities), a comment, an attribute and a
// CDATA section. The return value of loadXML() is not checked.
$doc->loadXML('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
     <!DOCTYPE html [
 <!ELEMENT test (#PCDATA) >
 <!ENTITY % xx "&#37;zz;">
 <!ENTITY % zz "&#60;">
 ]>
    <html>
    <div>Some text<!--a comment-->
    <strong class="test">some <![CDATA[<tag>]]> more text</strong>
    </div></html>');

/**
 * Echo one HTML table row per descendant of $node, depth-first.
 *
 * Each row has three cells: the node name (prefixed with $indent
 * non-breaking spaces), the node type name from nodeTypeToString(), and
 * the node value with whitespace made visible. $node itself is not
 * printed, only its children and their descendants.
 *
 * @param DOMNode $node   Node whose descendants are listed.
 * @param int     $indent Number of &nbsp; entities placed before the name;
 *                        grows by 4 per nesting level.
 *
 * @return void
 */
function rec(DOMNode $node, $indent = 0)
{
    if (!$node->hasChildNodes()) {
        return;
    }

    foreach ($node->childNodes as $child) {
        // Node names of well-formed XML contain no HTML metacharacters, so
        // escaping is a no-op here; it is applied for consistency with the
        // value cell.
        $name = htmlspecialchars($child->nodeName, ENT_QUOTES, 'UTF-8');

        // nodeValue is null for node types that carry no value (e.g. the
        // doctype node). Cast to string so htmlspecialchars() is never
        // handed null, which is deprecated since PHP 8.1. Flags and charset
        // are pinned so the output does not depend on the PHP version's
        // defaults and matches the UTF-8 page encoding.
        $nv = htmlspecialchars((string) $child->nodeValue, ENT_QUOTES, 'UTF-8');

        // Make whitespace visible: control characters become their literal
        // escape sequences, spaces become non-breaking so runs of them are
        // not collapsed by the browser.
        $nv = str_replace(array("\n", "\r", "\t", ' '),
                          array('\n', '\r', '\t', '&nbsp;'), $nv);

        if ($nv === '') {
            $nv = '{empty}';
        } else {
            $nv = '"' . $nv . '"';
        }

        echo '<tr>';
        echo '<td>' . str_repeat('&nbsp;', $indent) . $name . '</td>';
        echo '<td>' . nodeTypeToString($child->nodeType) . '</td>';
        echo '<td>' . $nv . '</td>';
        echo '</tr>';

        rec($child, $indent + 4);
    }
}

// Emit the table: opening tag and header row first, then one row per node
// of the parsed document, then the closing tag.
echo '<table border="1">',
     '<tr><th>nodeName</th><th>nodeType</th><th>nodeValue</th></tr>';

rec($doc);

echo '</table>';