Perl 解析 HTML 时,比较常用的模块是 HTML::TreeBuilder。该模块将 HTML 字符串转化为 DOM 树,方便操作。
一个 DOM 元素对应一个 HTML::Element 对象,DOM 的
属性和方法都定义在该类中。以下是代码:
?
# Key properties and methods of a DOM element (an HTML::Element object):
#   attr:          $e->attr('id')      returns the value of the named attribute
#   text:          $e->as_text         returns the text wrapped by the tag,
#                                      e.g. <a>click me</a> yields "click me"
#   html:          $e->as_HTML         returns the HTML text of the element
#   tagname:       $e->tag()           returns the tag name (a, div, ...) in lowercase
#   parent node:   $e->parent          returns the parent node
#   children node: $e->content_list()  returns all child nodes (direct children only)
use strict;
use warnings;
use HTML::TreeBuilder;

# The sample document lives in the __DATA__ section at the end of this file.
my $html = join "", <DATA>;

# Other ways to use the helpers below:
# print $_->as_HTML for get_elements_by_attr_regex($html, 'id', qr/^\d+$/);
# my @elist = get_elements($html, 'table', 'id', qr/\d+/);
# print $elist[0]->as_HTML;

my $table = get_element_by_id($html, 'table1');
for my $child ($table->content_list()) {
    # content_list() can also yield text nodes as plain strings; those have
    # no tag() method, so only element objects are reported.
    next unless ref $child;
    print "child tag:", $child->tag(), "\n";
}

#function defined........................

# Parse an HTML string into a fresh HTML::TreeBuilder tree.
#   $html: HTML content
# Returns the tree object.
# NOTE(review): the tree is never released with ->delete because the elements
# handed back to callers belong to it; fine for a short script, confirm before
# reusing these helpers in a long-running process.
sub _parse_html {
    my ($html) = @_;
    my $tree = HTML::TreeBuilder->new;
    $tree->parse_content($html);
    return $tree;
}

# Find every element whose attribute value matches a pattern.
#   $html:       HTML content
#   $attr:       attribute name
#   $attr_regex: pattern (qr//) the attribute value must match
# Returns the list of matching elements.
sub get_elements_by_attr_regex {
    my ($html, $attr, $attr_regex) = @_;
    my $tree = _parse_html($html);
    my @list = $tree->look_down($attr, $attr_regex);
    return @list;
}

# Find the single element carrying the given id.
#   $html:    HTML content
#   $idvalue: id value
# Returns the element; dies unless exactly one element has that id.
sub get_element_by_id {
    my ($html, $idvalue) = @_;
    my $tree = _parse_html($html);
    my @list = $tree->look_down('id', $idvalue);
    die "not unique element by id:$idvalue" if scalar(@list) != 1;
    return $list[0];
}

# Find elements by tag name.
#   $html:    HTML content
#   $tagname: tag name
# Returns whatever find_by_tag_name() yields in the caller's context.
sub get_elements_by_tag_name {
    my ($html, $tagname) = @_;
    my $tree = _parse_html($html);
    return $tree->find_by_tag_name($tagname);
}

# Find elements with a given tag whose attribute value matches a pattern.
#   $html:       HTML string
#   $tag:        tag name (compared case-insensitively)
#   $attr:       attribute name
#   $attr_regex: pattern (qr//) the attribute value must match
# Returns the list of matching elements.
sub get_elements {
    my ($html, $tag, $attr, $attr_regex) = @_;
    my @list = get_elements_by_attr_regex($html, $attr, $attr_regex);

    # tag() reports names in lowercase, so normalise the requested tag first.
    $tag = lc $tag;
    @list = grep { $_->tag() eq $tag } @list;
    return @list;
}

__DATA__
<table id="table1" border="1" cellspacing="0" cellpadding="6">
<tr><td><a href="x">x text</a></td><td><a href="y">y</a></td></tr>
<tr><td id='1s'>1</td><td >2</td></tr>
</table>
?
?
?
?
?