澳门新浦京8455com提取HTML标签

?php
/********************************* *
*作者: 徐祖宁 (唠叨) *邮箱: czjsz_ah@stats.gov.cn *开发: 2002.07 *
* *函数: tags *功能: 从文件中提取HTML标签 * *入口: *$filename
文件名 *$tag标签名 *返回: *数组,每项为: *tagNameString *TextString
*AttrsArray * *示例: *print_r(tags(“test1.htm”,”a”));
*print_r(“”,”img”); * */ function tags($filename,$tag) { $buffer =
join(“”,file($filename)); $buffer = eregi_replace(“”,””,$buffer);
$tagkey = sql_regcase($tag); $buffer = eregi_replace(“$tagkey “,”$tag
“,$buffer); $ar = split(“”,$buffer); foreach($ar as $v) { if(!
eregi(“$tagkey “,$v)) continue; eregi(“$tagkey
([^]*)((.*)/$tagkey)?”,$v,$regs); $p[tagName] = strtoupper($tag);
if($regs[3]) $p[Text] = $regs[3]; $s = trim(eregi_replace(“[
]+”,” “,$regs[1])).” “; $s = eregi_replace(” *= *”,”=”,$s); $a =
split(” “,$s); for($i=0;$icount($a);$i++) { $ch = array();
if(eregi(“=[“]”,$a[$i])) { $j = $i+1;
while(!eregi(“[“]$”,$a[$i])) { $a[$i] .= ” “.$a[$j];
unset($a[$j]); } } } foreach($a as $k) { $name =
strtoupper(strtok($k,”=”)); $value = strtok(“

        return $resultstr;
    }
   
}

$str =
“a1<body>b2<p>c3<em>d4</em>e5</p>f6</body>g7h8”;

echo “算法运行时间(microtime):”,(microtime(true)-$sttime),'<br/>’;

 

//方法三

            $tag = array_pop($tagstack);

 代码如下

            $resultstr .= $str[$i];

function mysubstr( $str, $length ){
   
    $tagcnt = 0;
    $charcnt = 0;
    $tag = ”;
    $maxlen = strlen( $str );
    $resultstr = ”;
    $tagstack = array();

    //栈是空的直接返回
    if(empty($tagstack)){
        return $resultstr;
    }
    //否则去掉没有结束标记的开始标记
    else{
       
        while(!empty($tagstack)){

/**
 * Cut an HTML string down to a given amount of text, not counting markup.
 *
 * The input is split into alternating text and tag pieces. Text is consumed
 * until $len bytes have been kept; tags encountered on the way are kept
 * verbatim (only the tag name is lower-cased). Opening tags whose closing tag
 * fell outside the kept part are then either removed or closed at the end.
 *
 * Limitations:
 *  - Length is counted in bytes, so a multi-byte character can be cut in half.
 *  - Void elements written without a trailing "/>" (e.g. <br>, <img ...>) are
 *    treated like any other opening tag.
 *
 * @param string $str  HTML fragment to truncate.
 * @param int    $len  Number of text bytes to keep.
 * @param int    $mode How to treat opening tags left unclosed by the cut:
 *                     0 (default) removes them, any truthy value appends the
 *                     missing closing tags.
 * @return string The truncated fragment.
 */
function html_substr($str, $len, $mode = 0)
{
    // With one capture group and no NO_EMPTY flag the result strictly
    // alternates: even indexes are text, odd indexes are tags/comments.
    // The comment branch is non-greedy so two comments stay two pieces.
    $pieces = preg_split('/(<!--.*?-->|<[^>]*>)/s', $str, -1, PREG_SPLIT_DELIM_CAPTURE);

    $last = 0;
    foreach ($pieces as $idx => $piece) {
        $last = $idx;
        if ($idx % 2 === 0) {
            $len -= strlen($piece);
            if ($len < 0) {
                // Negative length trims the overshoot from the end.
                $pieces[$idx] = (string) substr($piece, 0, $len);
            }
        } elseif (preg_match('/^<\/?[a-zA-Z][^\s\/>]*/', $piece, $m)) {
            // Normalise only the tag name; attribute values keep their case.
            $pieces[$idx] = strtolower($m[0]) . substr($piece, strlen($m[0]));
        }
        if ($len <= 0) {
            break;
        }
    }
    $pieces = array_slice($pieces, 0, $last + 1);

    // Pair opening and closing tags with a stack so nested tags of the same
    // name and tags carrying attributes are matched correctly.
    $stack = array();     // entries: array(piece index, tag name)
    $unclosed = array();  // piece index => tag name
    foreach ($pieces as $idx => $piece) {
        if ($idx % 2 === 0) {
            continue;
        }
        // Comments, doctypes and other non-element markup are left alone.
        if (!preg_match('/^<(\/?)([a-z][^\s\/>]*)/', $piece, $m)) {
            continue;
        }
        // Explicitly self-closed elements need no closing tag.
        if (substr($piece, -2) === '/>') {
            continue;
        }
        if ($m[1] === '') {
            $stack[] = array($idx, $m[2]);
            continue;
        }
        $pos = count($stack) - 1;
        while ($pos >= 0 && $stack[$pos][1] !== $m[2]) {
            $pos--;
        }
        if ($pos < 0) {
            continue; // closing tag without an opener: keep it as it is
        }
        // Everything opened after the matched opener was never closed.
        $popped = array_splice($stack, $pos);
        array_shift($popped);
        foreach ($popped as $entry) {
            $unclosed[$entry[0]] = $entry[1];
        }
    }
    foreach ($stack as $entry) {
        $unclosed[$entry[0]] = $entry[1];
    }
    ksort($unclosed);

    if ($mode) {
        // Close in reverse document order so the nesting stays valid.
        $tail = '';
        foreach ($unclosed as $name) {
            $tail = '</' . $name . '>' . $tail;
        }
        $pieces[] = $tail;
    } else {
        foreach ($unclosed as $idx => $name) {
            $pieces[$idx] = '';
        }
    }

    return implode('', $pieces);
}
// Sample input for the html_substr() demonstration.
$str = "123<em>abc</em>456<em>def</em>789";

 代码如下

        $charcnt++;
        $resultstr .= $str[$i];
    }

 

   
    echo ‘<hr size=1>最后结果为:’;

            $resultstr[$i++] = ”;
       
        }

            $index = strrpos($resultstr, $tag);

$sttime = microtime(true);

echo “内存使用情况:”,(memory_get_usage()-$stmem),'<br />’;

            for($i = $index-1; $resultstr[$i] != ‘>’; $i++ ){
                $resultstr[$i] = ”;
            }

$stmem = memory_get_usage();

echo htmlspecialchars($s);

    for( $i = 0; $i < $length; $i++ ){
        if( $str[$i] == ‘<‘ ){

            for( $j=$i; $str[$j]!=’>’; $j++,$length++ ){
                $tag .= $str[$j];
            }
            $tagcnt++;
            $length++;
            $tag .= ‘>’;
           
            //如果是开始标记,则入栈,如果是与之相对应的结束标记则出栈
            if( preg_match(‘/<([^/]+)?>/i’, $tag, $r) ){
                echo ‘入栈:’,htmlspecialchars($r[1]),'<br />’;
                array_push($tagstack, $r[1]);
            }
            elseif( preg_match(
‘/’.$tagstack[count($tagstack)-1].’/’, $tag ) ){
                echo
‘出栈:’,htmlspecialchars($tagstack[count($tagstack)-1]),'<br
/>’;
                array_pop( $tagstack );
            }

 代码如下

//方法二

// Print both truncation modes as literal text: first with unclosed tags
// removed, then with the missing closing tags appended.
echo '<xmp>';
echo html_substr($str, 5) . PHP_EOL;
echo html_substr($str, 5, 1);

 

/

            $tag = ”;
            continue;
        }

echo ‘处理结果为:<br/><hr size=1>’,htmlspecialchars(
mysubstr( $str, 18 ) ),'<br />’;

// Method three: keep the first $gn text characters of $str and re-attach
// the tags that were seen before the cut.
$str = "a1<body>b2c3<p><em>d4</em>e</p>5f6</body>g7h8";
$gn  = 7;          // number of text characters to keep
$i   = $j = 0;     // $i: read position in $str, $j: text characters kept
$tag = 0;          // 1 while the scanner is between '<' and '>'
$tg  = '';         // name of the tag currently being read
$tgs = array();    // <em> / </em> tags, keyed by index of the preceding character
$ogs = array();    // every other tag, stored as "flag|<tag>" under the same key
$tmp = array();    // the kept text characters
$s   = '';         // resulting string
$n   = strlen($str);

// Bound the scan by the string length instead of the truthiness of the
// character read, so a literal "0" in the text does not end the loop.
while ($i < $n && $j < $gn) {
    $c = $str[$i++];
    if ($c == '<') {
        $tag = 1;
    } elseif ($c == '>') {
        if (trim($tg, '/') == 'em') {
            $tgs[$j - 1] = '<' . $tg . '>';
        } else {
            // Flag 1 means an em tag was already recorded at this position,
            // so this tag has to be emitted after it; flag 0 means before.
            if (!empty($tgs[$j - 1])) {
                $ogs[$j - 1] = '1|' . '<' . $tg . '>';
            } else {
                $ogs[$j - 1] = '0|' . '<' . $tg . '>';
            }
        }
        $tag = 0;
        $tg  = '';
    } elseif ($tag == 1) {
        $tg .= $c;
    } else {
        $tmp[] = $c;
        $j++;
    }
}

// An odd number of recorded em tags presumably means the last opener lost
// its closer to the cut, so it is dropped.
$ts = count($tgs);
if ($ts % 2) {
    array_pop($tgs);
}

foreach ($tmp as $k => $v) {
    $em    = isset($tgs[$k]) ? $tgs[$k] : '';
    $flag  = '0';
    $other = '';
    if (isset($ogs[$k])) {
        list($flag, $other) = explode('|', $ogs[$k], 2);
    }
    if ($em !== '' && $other !== '' && !$flag) {
        // The other tag came first in the source.
        $s .= $v . $other . $em;
    } else {
        $s .= $v . $em . $other;
    }
}

You can leave a response, or trackback from your own site.

Leave a Reply

网站地图xml地图