+
+namespace {
+
+// Flatten document fragments and coalesce adjacent text nodes, recursively.
+//
+// Every XML_DOCUMENT_FRAG_NODE among the children of `node` is replaced by its
+// own children (in their original order), and each run of neighboring text
+// nodes is merged into a single text node. The same is then done for every
+// remaining child subtree.
+//
+// `node` may be NULL, in which case nothing happens.
+void normalize_node(xmlNodePtr node)
+{
+    if (node == NULL) {
+        return;
+    }
+
+    xmlNode *next_child;
+    for (xmlNode *child = node->children; child != NULL; child = next_child) {
+        next_child = child->next;
+        if (child->type != XML_DOCUMENT_FRAG_NODE) {
+            continue;
+        }
+
+        // Remember the node in front of the fragment so the scan can resume
+        // at the first hoisted node. Without this, a fragment nested directly
+        // inside this one would be moved up but never flattened itself.
+        xmlNode *before = child->prev;
+
+        // xmlAddPrevSibling unlinks its argument from the fragment before
+        // inserting it, so child->children advances on every iteration.
+        while (child->children != NULL) {
+            xmlAddPrevSibling(child, child->children);
+        }
+
+        xmlUnlinkNode(child);
+        xmlFreeNode(child);
+
+        // Continue with whatever now follows `before`: the hoisted nodes if
+        // there were any, otherwise the fragment's former next sibling.
+        next_child = (before != NULL) ? before->next : node->children;
+    }
+
+    // Removing fragments can leave text nodes next to each other (xmlUnlinkNode
+    // in particular does not merge), so make an explicit merging pass and
+    // recurse into each child afterwards.
+    for (xmlNode *child = node->children; child != NULL; child = child->next) {
+        while (child->type == XML_TEXT_NODE &&
+               child->next != NULL && child->next->type == XML_TEXT_NODE) {
+            xmlNode *following = child->next;
+
+            // xmlNodeGetContent hands back a copy that this code must free.
+            xmlChar *content = xmlNodeGetContent(following);
+            if (content != NULL) {
+                xmlNodeAddContent(child, content);
+                xmlFree(content);
+            }
+
+            xmlUnlinkNode(following);
+            xmlFreeNode(following);
+        }
+        normalize_node(child);
+    }
+}
+
+// Strip unnecessary whitespace from the subtree rooted at `node`.
+//
+// Text nodes: unless `preserve_whitespace` is set, every run of spaces, tabs
+// and newlines is collapsed into a single space. A text node whose content
+// ends up empty (or equal to a single space, if `aggressive` is set) is
+// unlinked and freed, so a caller walking a sibling list must read
+// node->next before calling this function.
+//
+// Other nodes: all children are cleaned recursively. For the children of an
+// element, whitespace is preserved if and only if that element itself carries
+// xml:space="preserve"; the flag is recomputed at every element, and neither
+// DTDs nor CSS are consulted. An element left without children gets an empty
+// text node, unless XHTML allows it to be EMPTY, so that it is not serialized
+// with the <foo/> syntax where that is not appropriate.
+//
+// `node` may be NULL, in which case nothing happens.
+void clean_node(xmlNodePtr node, bool preserve_whitespace, bool aggressive)
+{
+    if (node == NULL) {
+        return;
+    }
+
+    if (node->type == XML_TEXT_NODE) {
+        // xmlNodeGetContent returns a freshly allocated copy (or NULL); take
+        // a std::string copy and release the libxml2 buffer right away.
+        std::string content;
+        xmlChar *raw = xmlNodeGetContent(node);
+        if (raw != NULL) {
+            content = reinterpret_cast<const char *>(raw);
+            xmlFree(raw);
+        }
+
+        if (!preserve_whitespace) {
+            // Compact in place; dstpos never runs ahead of srcpos.
+            std::string::size_type dstpos = 0;
+            for (std::string::size_type srcpos = 0; srcpos < content.size(); ++srcpos) {
+                const char ch = content[srcpos];
+                const bool is_space = (ch == '\n' || ch == '\t' || ch == ' ');
+                if (!is_space) {
+                    content[dstpos++] = ch;
+                } else if (dstpos == 0 || content[dstpos - 1] != ' ') {
+                    // First whitespace character of a run: emit one space.
+                    content[dstpos++] = ' ';
+                }
+                // Any further whitespace in the same run is dropped.
+            }
+            content.resize(dstpos);
+        }
+
+        if (content.empty() || (aggressive && content == " ")) {
+            xmlUnlinkNode(node);
+            xmlFreeNode(node);
+        } else {
+            xmlNodeSetContentLen(node,
+                                 reinterpret_cast<const xmlChar *>(content.data()),
+                                 static_cast<int>(content.size()));
+        }
+        return;
+    }
+
+    if (node->type == XML_ELEMENT_NODE) {
+        // A namespace-aware parse stores xml:space under the local name
+        // "space" in the XML namespace, so look there first. Fall back to the
+        // literal qualified name for trees built without namespace handling.
+        xmlChar *space = xmlGetNsProp(node, reinterpret_cast<const xmlChar *>("space"), XML_XML_NAMESPACE);
+        if (space == NULL) {
+            space = xmlGetProp(node, reinterpret_cast<const xmlChar *>("xml:space"));
+        }
+        preserve_whitespace = (space != NULL && strcmp(reinterpret_cast<const char *>(space), "preserve") == 0);
+
+        // The attribute value is a copy owned by this function.
+        if (space != NULL) {
+            xmlFree(space);
+        }
+    }
+
+    xmlNode *next_child;
+    for (xmlNode *child = node->children; child != NULL; child = next_child) {
+        // The recursive call may free `child`, so fetch its successor first.
+        next_child = child->next;
+        clean_node(child, preserve_whitespace, aggressive);
+    }
+
+    if (node->type == XML_ELEMENT_NODE && node->children == NULL) {
+        std::string tag = reinterpret_cast<const char *>(node->name);
+
+        // These are the only elements allowed in XHTML to be EMPTY,
+        // so insert dummy nodes to prevent the output from using
+        // the <foo/> syntax where not appropriate.
+        if (tag != "base" && tag != "meta" && tag != "link" && tag != "hr" &&
+            tag != "br" && tag != "param" && tag != "img" && tag != "area" &&
+            tag != "input" && tag != "col") {
+            xmlNode *text = xmlNewText(reinterpret_cast<const xmlChar *>(""));
+            xmlAddChild(node, text);
+        }
+    }
+}
+
+} // namespace
+
+// Normalize the document (flatten document fragments, merge adjacent text
+// nodes) and then strip unnecessary whitespace from it. With `aggressive`
+// set, text nodes consisting of a single space are removed as well.
+//
+// A document without a root element is left untouched.
+void XML_Template_clean_whitespace(XmlDocPtrWrapper doc, bool aggressive)
+{
+    // xmlDocGetRootElement returns NULL when there is no root element; both
+    // helpers dereference their argument, so bail out early in that case.
+    xmlNode *root = xmlDocGetRootElement(doc->ptr);
+    if (root == NULL) {
+        return;
+    }
+
+    normalize_node(root);
+    clean_node(root, false, aggressive);
+}
+