Sunday, January 2, 2011

MSXML validation

#!perl
# working test script to explore MSXML usage.
##################################################
use strict;
use Win32::OLE;
Win32::OLE->Option(Warn => 2);
use XML::XPath;
$| = 1;


my $contentType;
my $inputXML = $ARGV[0];
if ($inputXML =~ /OVAL/) { $contentType = 'OVAL'; }
# Verify that the input XML file exists.
unless (-e $inputXML) { print  "File $inputXML does not exist."; exit; }

# Load the root node.
my $nodes = XML::XPath->new($inputXML);
my $nodeSet = $nodes->find("/");

# Ensure that we only find one node.
if ($nodeSet->size < 1) {
    print  "Root node not found during validation of $inputXML.";
    exit;
    }
elsif ($nodeSet->size > 1) {
    print  "More than one root node found during validation of $inputXML.";
    exit;
    }
print "Root node exists\n";
# Get the root node.
my $rootNode = $nodeSet->get_node(1);

# Get the node as a string.
my $rootNodeString = $rootNode->toString(1);

my $schemaLocations;
my %schemaLocations;
if ($rootNodeString =~ /schemaLocation\s*=\s*"(.*?)"/i) { $schemaLocations = $1; }
else { print  "Could not find schemaLocation in $inputXML."; exit; }
print "schemaLocation found.\n";
# print "$schemaLocations\n";

#                        '!spaces' then 'space(s)' then '!spaces' then 'space(s)'
my $num_schemata;
while ($schemaLocations =~ /\s*([^\s]+?)\s+([^\s]+)\s*/g) { $schemaLocations{$1} = $2; $num_schemata++; }

if ($num_schemata > 0) { print "Successfully built schema locations hash.\n"; }
else { die "SchemaLocations hash is empty.\n"; }

# Verify that all of the files exist in our schema directory.
foreach my $uri (keys(%schemaLocations) )
{
    my $sl = $schemaLocations{$uri};
    $sl =~ s/.*[\\\/](.*$)/$1/gis;

    if ($contentType eq 'OVAL') { $sl = '../Schemas/oval_5.7/'. $sl; }
    else { $sl = './Schemas/'. $sl; }

    if (!(-e $sl)) { print  "Could not find schema " . $sl. ".\n"; }
    else { $schemaLocations{$uri} = $sl; } # Now, points to local copy of the schema
}

# Create a schema cache. MSXML 6.0 must be installed, 3.0 will not work.
my $schemaCache = Win32::OLE->new("MSXML2.XMLSchemaCache.6.0");
unless ($schemaCache) { die "MSXML 6.0 is not installed."; }
print "MSXML 6.0 is installed and schemaCache created.\n";

# Set properties of the schema cache.
$schemaCache->{async} = "false";
$schemaCache->{validateOnParse} = "1";
$schemaCache->{validateOnLoad} = "1";

# Load each schema into the cache.
foreach my $uri (keys(%schemaLocations)) { $schemaCache->add($uri, $schemaLocations{$uri});  }
$schemaCache->add('garbage');


# Create a DOM object. MSXML 6.0 must be installed, 3.0 will not work.
my $xmlDoc = Win32::OLE->new('MSXML2.DOMDocument.6.0');
unless ($xmlDoc) { die "MSXML 6.0 is not installed."; }

print "Setting DOM object properties.\n";
$xmlDoc->{async} = "false";
$xmlDoc->{validateOnParse} = "true";
$xmlDoc->{schemas} = $schemaCache;

# All required schema references have been addressed.
# Remove 'schemaLocation from root element to absolutely prevent
# any possible redundant use by MSXML libs
my $xmlString;
open(XML,"<$inputXML") or die "Failed to open XML file, $!\n";
while () { $xmlString .= $_; }
close XML;
if ($xmlString =~ s/xsi:schemaLocation\s*=\s*".*?"\s*//is) { print "Nulled out schemaLocation.\n"; }
else { print "Failed to null out schemaLocation\n"; }

print "Validating the input XML file against the cached schemas.\n";
unless ($xmlDoc->LoadXML("$xmlString"))
{
    my $Rs = $xmlDoc->{parseError}->{reason};
    my $Ln = $xmlDoc->{parseError}->{line};
    my $Ps = $xmlDoc->{parseError}->{linePos};
    my $Tx = $xmlDoc->{parseError}->{srcText};
    print "Did not load at line $Ln, pos $Ps, reason: $Rs, text: $Tx\n";

    my $parseError = $xmlDoc->{parseError};
    exit;
}

print "XML load successful, XML file is validated.\n";

No comments: