HTML::TreeBuilder has mojibake problem, it shows wired chars in the output
        Posted  
        
            by varun_vijay_r
        on Stack Overflow
        
        See other posts from Stack Overflow
        
            or by varun_vijay_r
        
        
        
        Published on 2010-06-09T14:07:22Z
        Indexed on 
            2010/06/09
            14:12 UTC
        
        
        Read the original article
        Hit count: 326
        
   use strict;
   use WWW::Curl::Easy;
   use HTML::TreeBuilder;
   my $cookie_file ='/tmp/pcook';
   my $curl = new WWW::Curl::Easy;
   my $response_body;
   my $charset = 'utf-8';
   $DocOffline::charset = undef;
   $curl->setopt (CURLOPT_URL, 'http://www.breitbart.com/article.php?id=D9G7CR5O0&show_article=1');
   $curl->setopt ( CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.9 (KHTML, like Gecko) Chrome/6.0.400.0 Safari/533.9');
   $curl->setopt ( CURLOPT_HEADER, 0);
   $curl->setopt ( CURLOPT_FOLLOWLOCATION, 1);
   $curl->setopt ( CURLOPT_AUTOREFERER, 1);
   $curl->setopt ( CURLOPT_SSL_VERIFYPEER, 0);
   $curl->setopt ( CURLOPT_COOKIEFILE, $cookie_file);
   $curl->setopt ( CURLOPT_COOKIEJAR, $cookie_file);
   $curl->setopt ( CURLOPT_REFERER, 'http://www.iavian.com/docOff/');
   $curl->setopt ( CURLOPT_HEADERFUNCTION, \&headerCallback );
   open (my $fileb, ">", \$response_body);
   $curl->setopt(CURLOPT_WRITEDATA,$fileb);
   my $retcode = $curl->perform;
   if ($retcode == 0) {
   my $dom_tree = HTML::TreeBuilder->new();
   $dom_tree->ignore_elements(qw(script style));
   $dom_tree->utf8_mode(1);
   $dom_tree->parse($response_body);
   $dom_tree->eof();
   print $dom_tree->as_HTML('<>&', ' ', {});
} else { print("An error happened: ".$curl->strerror($retcode)." ($retcode)\n"); }
sub headerCallback { my($data, $pointer) = @_; $data =~ m/Content-Type:\s*.*;\s*charset=(.*)/; if ($1) { $charset = $1; $charset =~ s/[^a-zA-Z0-9_-]*//g; } return length($data); }
© Stack Overflow or respective owner