From 0202ee945a19bcb6e497f0200f096de16db845f4 Mon Sep 17 00:00:00 2001
From: dakkar
Date: Sun, 5 Feb 2006 13:45:32 +0000
Subject: step per parsare ReST in un dom XML

git-svn-id: svn://luxion/repos/WebCoso/trunk@154 fcb26f47-9200-0410-b104-b98ab5b095f3
---
Review revision: line structure restored; POD and comments of the new files
translated to English; unused $xml_dom dropped; languages without an rstdoc
are skipped. The blob ids on the "index" lines are those of the original commit.

 lib/WebCoso/Step/ReST/SplitLang.pm |   2 +
 lib/WebCoso/Step/ReST/ToXml.pm     |  99 +++++++++++++++++++++++++++++++++++++
 t/steps/rest-xml.t                 | 102 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 203 insertions(+)
 create mode 100644 lib/WebCoso/Step/ReST/ToXml.pm
 create mode 100644 t/steps/rest-xml.t

diff --git a/lib/WebCoso/Step/ReST/SplitLang.pm b/lib/WebCoso/Step/ReST/SplitLang.pm
index a24e55e..dc60757 100644
--- a/lib/WebCoso/Step/ReST/SplitLang.pm
+++ b/lib/WebCoso/Step/ReST/SplitLang.pm
@@ -35,6 +35,8 @@ sub process {
         $fh=$resource->get_property({filename=>$filename},'datastream');
     }
 
+    binmode $fh,':utf8';
+
     # raccolgo le lingue usate
     my %langs=(''=>undef);
     seek $fh,0,0;
diff --git a/lib/WebCoso/Step/ReST/ToXml.pm b/lib/WebCoso/Step/ReST/ToXml.pm
new file mode 100644
index 0000000..2f9384a
--- /dev/null
+++ b/lib/WebCoso/Step/ReST/ToXml.pm
@@ -0,0 +1,99 @@
+package WebCoso::Step::ReST::ToXml;
+use strict;
+use warnings;
+use base 'WebCoso::Step';
+use Class::Std;
+use Inline 'Python';
+use XML::LibXML;
+use Encode;
+
+{
+
+=head2 What it does
+
+Takes each {language=>'whatever'}->rstdoc, hands it to docutils, feeds
+the resulting XML to the parser, and stores the DOM in
+{language=>'whatever'}->xmldom
+
+Assumes everything comes from a single file
+
+Does all of its work on the first pass (the 'meta' stage)
+
+=cut
+
+# One parser shared by every conversion: external DTDs are not loaded
+# and redundant namespace declarations are cleaned up.
+my $xml_parser=XML::LibXML->new();
+$xml_parser->load_ext_dtd(0);
+$xml_parser->clean_namespaces(1);
+
+# process($resource,$stage)
+# Step entry point; does nothing unless $stage is 'meta'. Converts the
+# resource's 'rstdoc' property or, when that is undefined, the 'rstdoc'
+# of every value on the 'language' axis, and stores each resulting DOM
+# in the matching 'xmldom' property.
+sub process {
+    my ($self,$resource,$stage)=@_;
+
+    return unless $stage eq 'meta';
+
+    my ($src_path)=$resource->get_axis_values('filename');
+
+    my $rst_doc=$resource->get_property('rstdoc');
+    if (defined $rst_doc) { # monolingual
+        $resource->set_property(
+            'xmldom',
+            rst2xml($rst_doc,$src_path)
+        );
+    }
+    else { # multilingual
+        my @langs=$resource->get_axis_values('language');
+        for my $cur_lang (@langs) {
+            $rst_doc=$resource->get_property({language=>$cur_lang},'rstdoc');
+            # a language without an rstdoc has nothing to convert;
+            # never hand undef to docutils
+            next unless defined $rst_doc;
+            $resource->set_property(
+                {language=>$cur_lang},
+                'xmldom',
+                rst2xml($rst_doc,$src_path,$cur_lang)
+            );
+        }
+    }
+
+    return;
+}
+
+# rst2xml($rst_string,$source_path,$language)
+# Encodes $rst_string as UTF-8, has docutils render it as XML, and
+# returns what the shared parser (base URI set to $source_path) makes
+# of that XML. $language falls back to 'it' when false.
+sub rst2xml {
+    my ($rst_string,$source_path,$language)=@_;
+
+    $rst_string=Encode::encode('utf-8',$rst_string);
+    my $xml_string=_rst2xml($rst_string,$source_path,$language||'it');
+    $xml_parser->base_uri($source_path);
+    return $xml_parser->parse_string($xml_string);
+}
+
+}
+1;
+__DATA__
+__Python__
+
+import locale
+import docutils.core
+
+# Publish the reST text in source through the docutils 'xml' writer,
+# reading and writing utf-8, with language as the docutils language_code.
+def _rst2xml(source,source_path,language):
+    return docutils.core.publish_string(
+        source,source_path=source_path,
+        writer_name='xml',
+        settings_overrides={
+            'input_encoding':'utf-8',
+            'output_encoding':'utf-8',
+            'language_code':language,
+        },
+    )
diff --git a/t/steps/rest-xml.t b/t/steps/rest-xml.t
new file mode 100644
index 0000000..6ee8fa2
--- /dev/null
+++ b/t/steps/rest-xml.t
@@ -0,0 +1,102 @@
+#!/usr/bin/perl
+use utf8;
+use strict;
+use warnings;
+use Path::Class;
+use Test::More 'no_plan';
+use Test::Differences;
+use Encode;
+use WebCoso::Resource;
+
+BEGIN {use_ok('WebCoso::Step::ReST::ToXml')}
+my $step=WebCoso::Step::ReST::ToXml->new();
+
+# make_res($rst) or make_res(lang=>$rst,...)
+# Builds a WebCoso::Resource carrying the given reST text as 'rstdoc':
+# a single argument is stored directly, pairs are stored per language.
+sub make_res {
+    my $resource=WebCoso::Resource->new();
+    # only needed to provide the path
+    $resource->set_property({filename=>'/tmp/mydoc.rest.txt'},datastream=>undef);
+    if (@_==1) { # monolingual
+        $resource->set_property(rstdoc=>$_[0]);
+    }
+    else { # multilingual
+        my %rst_doc=@_;
+        while (my ($lang,$str)=each %rst_doc) {
+            $resource->set_property(
+                {language=>$lang},
+                rstdoc=>$str
+            );
+        }
+    }
+
+    return $resource;
+}
+
+my $resource=make_res(<<'END_REST');
+Documento
+=========
+
+paragrafo àè
+END_REST
+
+$step->process($resource,'gen');
+ok(!defined $resource->get_property('xmldom'),'no action on second pass');
+
+$step->process($resource,'meta');
+my $dom=$resource->get_property('xmldom');
+isa_ok($dom,'XML::LibXML::Document','parsed ok on first pass');
+
+is($dom->findvalue('/document/@source'),
+   '/tmp/mydoc.rest.txt',
+   'source path');
+is($dom->findvalue('/document/title'),
+   'Documento',
+   'title');
+is($dom->findvalue('count(/document/paragraph)'),
+   1,
+   '1 paragraph');
+is($dom->findvalue('/document/paragraph'),
+   'paragrafo àè',
+   'paragraph content (unicode)');
+
+$resource=make_res(it=><<'END_REST_IT',en=><<'END_REST_EN');
+Documento
+=========
+
+paragrafo
+END_REST_IT
+Document
+========
+
+paragraph
+END_REST_EN
+
+$step->process($resource,'meta');
+$dom=$resource->get_property({language=>'it'},'xmldom');
+isa_ok($dom,'XML::LibXML::Document','parsed it');
+
+is($dom->findvalue('/document/@source'),
+   '/tmp/mydoc.rest.txt',
+   'source path');
+is($dom->findvalue('/document/title'),
+   'Documento',
+   'title');
+is($dom->findvalue('/document/paragraph'),
+   'paragrafo',
+   'paragraph content');
+
+$dom=$resource->get_property({language=>'en'},'xmldom');
+isa_ok($dom,'XML::LibXML::Document','parsed en');
+
+is($dom->findvalue('/document/@source'),
+   '/tmp/mydoc.rest.txt',
+   'source path');
+is($dom->findvalue('/document/title'),
+   'Document',
+   'title');
+is($dom->findvalue('/document/paragraph'),
+   'paragraph',
+   'paragraph content');
+
-- 
cgit v1.2.3