FEAR::API FEAR::API - There's no fear with this elegant site scraper #24

Extraction

Extract data from CPAN

# Example: run a CPAN search and extract every item from the result page.
# NOTE(review): url(), submit_form(), template(), extract and extresult are
# presumably exported by FEAR::API, and Dumper by Data::Dumper - neither
# import is shown in this snippet.
# url() returns something callable; the trailing ->() invokes it with no
# arguments (presumably this is what actually fetches the page - confirm).
url("http://search.cpan.org/recent")->();
# Submit the form named "f" with its "query" field set to "perl".
submit_form(
        form_name => "f",
        fields => {
                   query => "perl"
                  });
# Extraction template: [% p %] is the placeholder sitting between the
# <!--item--> and <!--end item--> markers (assumed to be captured under the
# key "p" in each result - TODO confirm).
template("<!--item-->[% p %]<!--end item-->");
extract;
# Dump whatever extresult returns.
print Dumper extresult;

Extract data from CPAN after some HTML cleanup

# Example: same search, but trim the page before extracting from it.
# NOTE(review): the DSL functions used here are presumably exported by
# FEAR::API, and Dumper by Data::Dumper - the imports are not shown.
# url() returns something callable; the trailing ->() invokes it with no
# arguments (presumably this is what actually fetches the page - confirm).
url("http://search.cpan.org/recent")->();
# Submit the form named "f" with its "query" field set to "perl".
submit_form(
        form_name => "f",
        fields => {
                   query => "perl"
                  });
# q(...) is a single-quoted string, so the substitution is NOT run here; its
# source text is handed to preproc (presumably applied to the fetched
# document - confirm). The substitution replaces the whole text with the part
# captured between <!--results--> and <!--end results-->; /s lets "." match
# newlines so the pattern can span lines.
preproc(q(s/\A.+<!--results-->(.+)<!--end results-->.+\Z/$1/s));
print document->as_string;    # print content to STDOUT
# Extraction template: [% p %] is the placeholder sitting between the
# <!--item--> and <!--end item--> markers.
template("<!--item-->[% p %]<!--end item-->");
extract;
# Dump whatever extresult returns.
print Dumper extresult;

HTML cleanup, extract data, and refine results

# Example: trim the page, extract items, then clean up each extracted record.
# NOTE(review): the DSL functions used here are presumably exported by
# FEAR::API, and Dumper by Data::Dumper - the imports are not shown.
# url() returns something callable; the trailing ->() invokes it with no
# arguments (presumably this is what actually fetches the page - confirm).
url("http://search.cpan.org/recent")->();
# Submit the form named "f" with its "query" field set to "perl".
submit_form(
        form_name => "f",
        fields => {
                   query => "perl"
                  });
# q(...) is a single-quoted string, so the substitution is passed to preproc
# as text rather than executed here. It replaces the whole text with the part
# captured between <!--results--> and <!--end results-->; /s lets "." match
# newlines.
preproc(q(s/\A.+<!--results-->(.+)<!--end results-->.+\Z/$1/s));
# $$_ dereferences $_ as a scalar reference (presumably $_ refers to the
# current document content at this point - confirm).
print $$_;    # print content to STDOUT
# Extraction template: [% rec %] is the placeholder between the item markers;
# the postproc code below reads the matching "rec" key.
template("<!--item-->[% rec %]<!--end item-->");
extract;
# The code string removes every non-greedy <...> run from the "rec" field of
# $_ (presumably evaluated once per extracted result - confirm).
postproc(q($_->{rec} =~ s/<.+?>//g));     # Strip HTML tags
# Dump whatever extresult returns.
print Dumper extresult;

Use filtering syntax

# Example: the cleanup / template / refine steps written as one "|" chain.
# NOTE(review): fetch(), submit_form() and the _doc_filter/_template/
# _result_filter helpers are presumably exported by FEAR::API, and Dumper by
# Data::Dumper - the imports are not shown.
fetch("http://search.cpan.org/recent");
# Submit the form named "f" with its "query" field set to "perl".
submit_form(
            form_name => "f",
            fields => {
                       query => "perl"
            });
# The chain starts from $_ and combines three stages with "|" (presumably an
# overloaded operator on the object held in $_ - confirm):
#   _doc_filter    - receives, as text, a substitution that keeps only what
#                    lies between <!--results--> and <!--end results-->
#   _template      - receives the extraction template with a [% rec %] slot
#   _result_filter - receives, as text, code that strips <...> runs from the
#                    "rec" field of $_
$_ | _doc_filter(q(s/\A.+<!--results-->(.+)<!--end results-->.+\Z/$1/s))
   | _template("<!--item-->[% rec %]<!--end item-->")
   | _result_filter(q($_->{rec} =~ s/<.+?>//g));
# @$_ dereferences $_ as an array; a reference to that array is dumped
# (presumably the list of extracted results - confirm).
print Dumper \@$_;

Invoke handler for extracted results

# Example: run the filter chain, then hand the results to a handler.
# NOTE(review): fetch(), submit_form(), _doc_filter, _result_filter and
# invoke_handler are presumably exported by FEAR::API - the import is not
# shown.
fetch("http://search.cpan.org/recent");
# Submit the form named "f" with its "query" field set to "perl".
submit_form(
            form_name => "f",
            fields => {
                       query => "perl"
            });
# The chain starts from $_ and combines three stages with "|" (presumably an
# overloaded operator on the object held in $_ - confirm). The middle stage
# is a plain string holding the extraction template; presumably a bare string
# operand is treated as a template - verify against the module docs.
$_ | _doc_filter(q(s/\A.+<!--results-->(.+)<!--end results-->.+\Z/$1/s))
   | "<!--item-->[% rec %]<!--end item-->"
   | _result_filter(q($_->{rec} =~ s/<.+?>//g));
# The handler is named by the string 'Data::Dumper' (presumably each
# extracted result is passed to it for dumping - confirm).
invoke_handler('Data::Dumper');
Copyright © 2006 Yung-chung Lin