原文:http://www.kuomart.com/blog/read.php/242.htm
中文字符串的截取是一个很烦人而又难以解决的问题,下面收集了几段用PHP截取中文字符串的代码:
1、按字数截取UTF8字符
/**
 * Extract part of a UTF-8 string, counting in characters instead of bytes.
 *
 * Delegates to mb_substr() when the mbstring extension is loaded. Otherwise the
 * string is split into characters with a Unicode-aware regular expression and
 * the requested range is taken with array_slice().
 *
 * @param string $str    UTF-8 encoded input string.
 * @param int    $start  Zero-based character offset; negative values count from the end.
 * @param int    $length Maximum number of characters to return.
 * @return string The selected characters, or an empty string when nothing matches.
 */
function utf8_substr($str, $start, $length) {
    if (function_exists('mb_substr')) {
        return mb_substr($str, $start, $length, 'UTF-8');
    }

    // The "s" modifier makes "." match line breaks too; without it every "\n"
    // in the input was silently dropped from the result.
    // preg_match_all() yields 0 for an empty string and false for invalid UTF-8.
    if (!preg_match_all('/./us', $str, $chars)) {
        return '';
    }

    return implode('', array_slice($chars[0], $start, $length));
}
/**
 * Extract part of a UTF-8 string, counting in characters instead of bytes.
 *
 * Delegates to mb_substr() when the mbstring extension is loaded. Otherwise the
 * string is split into characters with a Unicode-aware regular expression and
 * the requested range is taken with array_slice().
 *
 * @param string $str    UTF-8 encoded input string.
 * @param int    $start  Zero-based character offset; negative values count from the end.
 * @param int    $length Maximum number of characters to return.
 * @return string The selected characters, or an empty string when nothing matches.
 */
function utf8_substr($str, $start, $length) {
    if (function_exists('mb_substr')) {
        return mb_substr($str, $start, $length, 'UTF-8');
    }

    // The "s" modifier makes "." match line breaks too; without it every "\n"
    // in the input was silently dropped from the result.
    // preg_match_all() yields 0 for an empty string and false for invalid UTF-8.
    if (!preg_match_all('/./us', $str, $chars)) {
        return '';
    }

    return implode('', array_slice($chars[0], $start, $length));
}
2、支持gb2312,gbk,utf-8,big5 中文截取
/**
 * Cut a character-based slice out of a multi-byte string.
 * Supports gb2312, gbk, utf-8 and big5.
 *
 * Uses mbstring when it is available; otherwise the string is split into
 * characters with a per-charset byte pattern.
 *
 * The suffix is appended only when text remains after the returned slice,
 * i.e. when the tail of the string was actually cut off.
 *
 * $start no longer declares a default value: an optional parameter placed in
 * front of the required $length could never be omitted by a caller, and PHP 8
 * reports such a declaration as deprecated.
 *
 * @param string $str     String to cut.
 * @param int    $start   Zero-based character offset where the slice begins.
 * @param int    $length  Number of characters to keep.
 * @param string $charset Encoding of $str: utf-8|gb2312|gbk|big5.
 * @param bool   $suffix  Whether to append "…" when the tail was cut off.
 * @return string The slice, followed by the suffix when applicable.
 * @throws \InvalidArgumentException In the non-mbstring fallback, when $charset is not supported.
 */
public function csubstr($str, $start, $length, $charset="utf-8", $suffix=true)
{
    if (function_exists("mb_substr")) {
        $slice = mb_substr($str, $start, $length, $charset);
        // Everything from $start to the end; used to tell whether anything was cut off.
        $rest = mb_substr($str, $start, null, $charset);
    } else {
        // Single-quoted so that PCRE itself interprets the \xHH escapes.
        $re = array(
            'utf-8'  => '/[\x01-\x7f]|[\xc2-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xff][\x80-\xbf]{3}/',
            'gb2312' => '/[\x01-\x7f]|[\xb0-\xf7][\xa0-\xfe]/',
            'gbk'    => '/[\x01-\x7f]|[\x81-\xfe][\x40-\xfe]/',
            'big5'   => '/[\x01-\x7f]|[\x81-\xfe]([\x40-\x7e]|[\xa1-\xfe])/',
        );
        $key = strtolower($charset);
        if (!isset($re[$key])) {
            throw new \InvalidArgumentException("Unsupported charset: " . $charset);
        }

        preg_match_all($re[$key], $str, $match);
        $chars = isset($match[0]) ? $match[0] : array();
        $slice = join("", array_slice($chars, $start, $length));
        $rest = join("", array_slice($chars, $start));
    }

    if ($suffix && $slice !== $rest) {
        return $slice."…";
    }
    return $slice;
}
/**
 * Cut a character-based slice out of a multi-byte string.
 * Supports gb2312, gbk, utf-8 and big5.
 *
 * Uses mbstring when it is available; otherwise the string is split into
 * characters with a per-charset byte pattern.
 *
 * The suffix is appended only when text remains after the returned slice,
 * i.e. when the tail of the string was actually cut off.
 *
 * $start no longer declares a default value: an optional parameter placed in
 * front of the required $length could never be omitted by a caller, and PHP 8
 * reports such a declaration as deprecated.
 *
 * @param string $str     String to cut.
 * @param int    $start   Zero-based character offset where the slice begins.
 * @param int    $length  Number of characters to keep.
 * @param string $charset Encoding of $str: utf-8|gb2312|gbk|big5.
 * @param bool   $suffix  Whether to append "…" when the tail was cut off.
 * @return string The slice, followed by the suffix when applicable.
 * @throws \InvalidArgumentException In the non-mbstring fallback, when $charset is not supported.
 */
public function csubstr($str, $start, $length, $charset="utf-8", $suffix=true)
{
    if (function_exists("mb_substr")) {
        $slice = mb_substr($str, $start, $length, $charset);
        // Everything from $start to the end; used to tell whether anything was cut off.
        $rest = mb_substr($str, $start, null, $charset);
    } else {
        // Single-quoted so that PCRE itself interprets the \xHH escapes.
        $re = array(
            'utf-8'  => '/[\x01-\x7f]|[\xc2-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xff][\x80-\xbf]{3}/',
            'gb2312' => '/[\x01-\x7f]|[\xb0-\xf7][\xa0-\xfe]/',
            'gbk'    => '/[\x01-\x7f]|[\x81-\xfe][\x40-\xfe]/',
            'big5'   => '/[\x01-\x7f]|[\x81-\xfe]([\x40-\x7e]|[\xa1-\xfe])/',
        );
        $key = strtolower($charset);
        if (!isset($re[$key])) {
            throw new \InvalidArgumentException("Unsupported charset: " . $charset);
        }

        preg_match_all($re[$key], $str, $match);
        $chars = isset($match[0]) ? $match[0] : array();
        $slice = join("", array_slice($chars, $start, $length));
        $rest = join("", array_slice($chars, $start));
    }

    if ($suffix && $slice !== $rest) {
        return $slice."…";
    }
    return $slice;
}
以上两个函数都比较不错;下面这个用类的方式来实现,仅供参考,不做过多评价,代码如下:
3、类实现方法
<?php
/**
 * Character-wise counting and slicing of UTF-8 strings using only byte-level
 * string functions (ord, strlen, substr).
 *
 * A character's byte width is taken from its lead byte alone: >= 240 means
 * four bytes, >= 224 three, >= 192 two, anything else one. Continuation bytes
 * are not validated, so a stray byte below 192 counts as a one-byte character.
 */
class splite_utf8
{
    /**
     * Byte width of the UTF-8 sequence introduced by a lead byte.
     *
     * @param int $lead Lead byte value as returned by ord().
     * @return int Width in bytes, 1 to 4.
     */
    private function sequence_width($lead)
    {
        if ($lead >= 240) {
            return 4;
        }
        if ($lead >= 224) {
            return 3;
        }
        if ($lead >= 192) {
            return 2;
        }
        return 1;
    }

    /**
     * Advance a byte offset by up to $count characters.
     *
     * @param string $str    String being walked.
     * @param int    $offset Byte offset to start from.
     * @param int    $count  Characters to skip; zero or negative skips nothing.
     * @return int New byte offset, never past the end of $str.
     */
    private function skip_chars($str, $offset, $count)
    {
        $total = strlen($str);
        for ($i = 0; $i < $count && $offset < $total; $i++) {
            $offset += $this->sequence_width(ord($str[$offset]));
        }
        // A truncated final sequence may overshoot the end; clamp it.
        return min($offset, $total);
    }

    /**
     * Count the characters in a UTF-8 string.
     *
     * Walks the string iteratively by byte offset, so long inputs need neither
     * deep recursion nor repeated copies of the remaining text. An empty string
     * adds nothing to the count.
     *
     * @param string $str    String to measure.
     * @param int    $length Starting value added to the count (defaults to 0).
     * @return int $length plus the number of characters in $str.
     */
    public function count_word($str, $length=0 )
    {
        $str = (string) $str;
        $total = strlen($str);
        $offset = 0;
        while ($offset < $total) {
            $offset += $this->sequence_width(ord($str[$offset]));
            $length += 1;
        }
        return $length;
    }

    /**
     * Return a character-based slice of a UTF-8 string.
     *
     * @param string $str    String to slice.
     * @param int    $start  Zero-based character offset; a negative value is
     *                       added to the character count, i.e. counts from the end.
     * @param int    $length Number of characters to return; -1 (the default)
     *                       returns everything from $start to the end. Any other
     *                       negative value yields an empty string.
     * @return string The requested characters.
     */
    public function splite_mulit_utf8_word ($str, $start = 0, $length = -1 )
    {
        $str = (string) $str;
        if ($start < 0) {
            $start = $this->count_word($str) + $start;
        }

        $begin = $this->skip_chars($str, 0, $start);
        if ($length == -1) {
            return (string) substr($str, $begin);
        }

        $end = $this->skip_chars($str, $begin, $length);
        return (string) substr($str, $begin, $end - $begin);
    }
}
// Demo: count the characters of a mixed Chinese/ASCII string, then take the
// two characters that start six characters before the end.
$utf = new splite_utf8();
$text = '的萨芬dfdf!@#$%^&*I()';
$length = $utf->count_word($text);
// "\n" is a real line break; the earlier "/n" printed a literal slash and "n".
echo $length."\n";
$word = $utf->splite_mulit_utf8_word($text, -6, 2);
var_dump($word);
?>
/**
 * Character-wise counting and slicing of UTF-8 strings using only byte-level
 * string functions (ord, strlen, substr).
 *
 * A character's byte width is taken from its lead byte alone: >= 240 means
 * four bytes, >= 224 three, >= 192 two, anything else one. Continuation bytes
 * are not validated, so a stray byte below 192 counts as a one-byte character.
 */
class splite_utf8
{
    /**
     * Byte width of the UTF-8 sequence introduced by a lead byte.
     *
     * @param int $lead Lead byte value as returned by ord().
     * @return int Width in bytes, 1 to 4.
     */
    private function sequence_width($lead)
    {
        if ($lead >= 240) {
            return 4;
        }
        if ($lead >= 224) {
            return 3;
        }
        if ($lead >= 192) {
            return 2;
        }
        return 1;
    }

    /**
     * Advance a byte offset by up to $count characters.
     *
     * @param string $str    String being walked.
     * @param int    $offset Byte offset to start from.
     * @param int    $count  Characters to skip; zero or negative skips nothing.
     * @return int New byte offset, never past the end of $str.
     */
    private function skip_chars($str, $offset, $count)
    {
        $total = strlen($str);
        for ($i = 0; $i < $count && $offset < $total; $i++) {
            $offset += $this->sequence_width(ord($str[$offset]));
        }
        // A truncated final sequence may overshoot the end; clamp it.
        return min($offset, $total);
    }

    /**
     * Count the characters in a UTF-8 string.
     *
     * Walks the string iteratively by byte offset, so long inputs need neither
     * deep recursion nor repeated copies of the remaining text. An empty string
     * adds nothing to the count.
     *
     * @param string $str    String to measure.
     * @param int    $length Starting value added to the count (defaults to 0).
     * @return int $length plus the number of characters in $str.
     */
    public function count_word($str, $length=0 )
    {
        $str = (string) $str;
        $total = strlen($str);
        $offset = 0;
        while ($offset < $total) {
            $offset += $this->sequence_width(ord($str[$offset]));
            $length += 1;
        }
        return $length;
    }

    /**
     * Return a character-based slice of a UTF-8 string.
     *
     * @param string $str    String to slice.
     * @param int    $start  Zero-based character offset; a negative value is
     *                       added to the character count, i.e. counts from the end.
     * @param int    $length Number of characters to return; -1 (the default)
     *                       returns everything from $start to the end. Any other
     *                       negative value yields an empty string.
     * @return string The requested characters.
     */
    public function splite_mulit_utf8_word ($str, $start = 0, $length = -1 )
    {
        $str = (string) $str;
        if ($start < 0) {
            $start = $this->count_word($str) + $start;
        }

        $begin = $this->skip_chars($str, 0, $start);
        if ($length == -1) {
            return (string) substr($str, $begin);
        }

        $end = $this->skip_chars($str, $begin, $length);
        return (string) substr($str, $begin, $end - $begin);
    }
}
// Demo: count the characters of a mixed Chinese/ASCII string, then take the
// two characters that start six characters before the end.
$utf = new splite_utf8();
$text = '的萨芬dfdf!@#$%^&*I()';
$length = $utf->count_word($text);
// "\n" is a real line break; the earlier "/n" printed a literal slash and "n".
echo $length."\n";
$word = $utf->splite_mulit_utf8_word($text, -6, 2);
var_dump($word);
?>