当前位置: 代码迷 >> ASP.NET >> 怎么提取一个页面中所有的文字(纯文字,排除图片 、链接、html 标签)
  详细解决方案

怎么提取一个页面中所有的文字(纯文字,排除图片、链接、HTML 标签)

热度:1645   发布时间:2013-02-25 00:00:00.0
如何提取一个页面中所有的文字(纯文字,排除图片、链接、HTML 标签)?
thanks!


------解决方案--------------------------------------------------------
C# code
/// <summary>
/// Strips HTML markup from a fragment and returns the remaining text.
/// </summary>
/// <remarks>
/// Everything from a '&lt;' up to the next '&gt;' is removed, including tags
/// that are split across several lines. The literal entity "&amp;nbsp;" and all
/// CR/LF characters are removed as well. Other entities are left as they are,
/// and the text inside script/style elements is NOT removed (only the tags).
/// </remarks>
/// <param name="str">HTML text to clean; may be null or empty.</param>
/// <returns>
/// The input without tags, "&amp;nbsp;" entities and line breaks, or an empty
/// string when <paramref name="str"/> is null or empty.
/// </returns>
protected static string ConvertGettext(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        return string.Empty;
    }

    // [^>]* also matches line breaks, so a tag such as "<a\nhref=...>" is
    // removed as a whole instead of being left behind half-stripped.
    string withoutTags = Regex.Replace(str, @"<[^>]*>", string.Empty);

    return withoutTags.Replace("&nbsp;", "").Replace("\n", "").Replace("\r", "");
}
------解决方案--------------------------------------------------------
C# code
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

namespace WikiPageCreater.Common
{
    /// <summary>
    /// Helpers for downloading a web page and detecting its character set.
    /// </summary>
    public class PageHelper
    {
        /// <summary>Request timeout in milliseconds.</summary>
        private const int RequestTimeoutMilliseconds = 20000;

        /// <summary>Responses declaring a Content-Length of this many bytes or more are skipped.</summary>
        private const long MaxContentLength = 1024 * 1024;

        /// <summary>
        /// Finds a "charset=..." declaration. An optional opening quote is skipped so that both
        /// <c>content="text/html; charset=utf-8"</c> and <c>&lt;meta charset="utf-8"&gt;</c> yield "utf-8".
        /// </summary>
        private static readonly Regex CharsetRegex = new Regex(
            @"charset\s*=\s*[""']?\s*(?<charset>[\w.:\-]+)",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Determines the character set name of the page at <paramref name="url"/>.
        /// </summary>
        /// <remarks>
        /// The first "charset=" declaration found in the page body wins; otherwise the
        /// character set reported by the response is used. Redirects are not followed and only
        /// responses with status 200 are inspected.
        /// </remarks>
        /// <param name="url">Absolute HTTP(S) address of the page.</param>
        /// <returns>
        /// The detected charset name, or <c>Encoding.Default.BodyName</c> when nothing could be
        /// detected or the request failed for any reason.
        /// </returns>
        public static string GetEncoding(string url)
        {
            try
            {
                HttpWebRequest request = CreateRequest(url);
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (IsReadable(response))
                    {
                        // Read the header value before the body stream is consumed and closed.
                        string headerCharset = response.CharacterSet;

                        // The charset declaration itself is plain ASCII, so ASCII decoding is
                        // sufficient for sniffing regardless of the page's real encoding.
                        string html;
                        using (StreamReader reader = OpenReader(response, Encoding.ASCII))
                        {
                            html = reader.ReadToEnd();
                        }

                        Match declared = CharsetRegex.Match(html);
                        if (declared.Success)
                        {
                            return declared.Groups["charset"].Value;
                        }

                        if (!string.IsNullOrEmpty(headerCharset))
                        {
                            return headerCharset;
                        }
                    }
                }
            }
            catch (Exception)
            {
                // Deliberate best effort: any failure (bad URL, network error, corrupt gzip
                // data, ...) falls through to the default encoding name below.
            }

            return Encoding.Default.BodyName;
        }

        /// <summary>
        /// Downloads the HTML source of the page at <paramref name="url"/>, decoding it with
        /// <paramref name="encoding"/>.
        /// </summary>
        /// <remarks>
        /// Redirects are not followed and only responses with status 200 are read.
        /// </remarks>
        /// <param name="url">Absolute HTTP(S) address of the page.</param>
        /// <param name="encoding">Encoding used to decode the response body.</param>
        /// <returns>
        /// The page source, or an empty string when the response is not usable or the request
        /// failed for any reason.
        /// </returns>
        public static string GetHtml(string url, Encoding encoding)
        {
            try
            {
                HttpWebRequest request = CreateRequest(url);
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (IsReadable(response))
                    {
                        using (StreamReader reader = OpenReader(response, encoding))
                        {
                            return reader.ReadToEnd();
                        }
                    }
                }
            }
            catch (Exception)
            {
                // Deliberate best effort: any failure yields an empty result below.
            }

            return string.Empty;
        }

        /// <summary>Creates the HTTP request with the timeout and redirect policy shared by both public methods.</summary>
        private static HttpWebRequest CreateRequest(string url)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = RequestTimeoutMilliseconds;
            request.AllowAutoRedirect = false;
            return request;
        }

        /// <summary>Returns true for a 200 response whose declared length is below the size limit.</summary>
        private static bool IsReadable(HttpWebResponse response)
        {
            return response.StatusCode == HttpStatusCode.OK
                && response.ContentLength < MaxContentLength;
        }

        /// <summary>Opens a text reader over the response body, transparently decompressing gzip content.</summary>
        private static StreamReader OpenReader(HttpWebResponse response, Encoding encoding)
        {
            Stream body = response.GetResponseStream();
            string contentEncoding = response.ContentEncoding;

            if (contentEncoding != null
                && contentEncoding.Equals("gzip", StringComparison.OrdinalIgnoreCase))
            {
                body = new GZipStream(body, CompressionMode.Decompress);
            }

            return new StreamReader(body, encoding);
        }
    }
}
  相关解决方案