当前位置: 代码迷 >> C# >> 点滴累积【C#】-抓取页面中想要的数据
  详细解决方案

点滴累积【C#】-抓取页面中想要的数据

热度:287   发布时间:2016-04-28 08:26:57.0
点滴积累【C#】---抓取页面中想要的数据

效果

描述:此功能用于抓取国外一个监测PM2.5的网站,实时读取该网站的数据并保存到数据库中,每隔一小时刷新一次。

地址为:http://beijing.usembassy-china.org.cn/070109air.html

筛选后的地址为:http://utils.usembassy.gov/feed2js/feed2js.php?src=http%3A%2F%2Fwww.stateair.net%2Fweb%2Frss%2F1%2F1.xml&desc=1&num=7&targ=y&utf=y&pc=y&words=40&

思路:先抓取页面的全部数据并保存到txt文件中,再逐行读取该txt文件,用Split、Substring截取出自己想要的数据,最后保存到数据库;插入数据库之前先检查该记录是否已经存在,不存在时才插入。

代码

using System;
using System.Collections.Generic;
using System.Configuration;
using System.Data;
using System.Data.SqlClient;
using System.Globalization;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

/********************************
 * Author: 青苹果
 * Created: 2015-12-28
 * Description: fetches the PM2.5 readings published for the United States feed
 * ******************************/
namespace GetUSAData
{
    /// <summary>
    /// Console program that downloads a page, stores it in a temporary text file,
    /// extracts the readings from that file and inserts them into the
    /// <c>T_twitter</c> table together with a running daily average.
    /// </summary>
    class Program
    {
        /// <summary>Address the page data is downloaded from.</summary>
        // The query-string separators must be plain '&'; an HTML-escaped "&amp;" would be
        // sent literally and change the parameter names seen by the server.
        public static string GetURL = "http://utils.usembassy.gov/feed2js/feed2js.php?src=http%3A%2F%2Fwww.stateair.net%2Fweb%2Frss%2F1%2F1.xml&desc=1&num=7&targ=y&utf=y&pc=y&words=40&";

        /// <summary>Path of the temporary text file, read from the "txtURL" app setting.</summary>
        public static string txtURL = ConfigurationManager.AppSettings["txtURL"];

        /// <summary>Database connection string, read from the "ConnectionString" entry.</summary>
        public static string conn = ConfigurationManager.ConnectionStrings["ConnectionString"].ToString();

        // Number of ';'-separated fields a parsed line must have (indexes 0..4 are read).
        private const int RequiredFieldCount = 5;

        // Counts the rows already stored for one calendar day and sums their hourly values.
        private const string DailySumSql =
            @"SELECT count(*) as allcount, sum(LatestHourdata1) as LatestHourdata1_avg, sum(LatestHourdata2) as LatestHourdata2_avg
              FROM T_twitter
              where ([LatestHourdata1] is not null or [LatestHourdata2] is not null or [Avgdata1] is not null or [AvgData2] is not null)
                and CONVERT(varchar(100), [datetime], 23) = @strDatetime";

        // Inserts a reading only when no row with the same datetime exists yet.
        private const string InsertIfMissingSql =
            @"if not exists(select * from dbo.T_twitter where datetime = @whereDatetime)
              begin
                  insert T_twitter (datetime,LatestHourdata1,LatestHourdata2,LatestHourdata3,Avgdata1,AvgData2,AvgData3)
                  VALUES(@datetime,@LatestHourdata1,@LatestHourdata2,@LatestHourdata3,@Avgdata1,@Avgdata2,@Avgdata3)
              end";

        static void Main(string[] args)
        {
            LoadGO();
        }

        /// <summary>
        /// Runs one full cycle: download the page, parse the temporary file, delete it,
        /// then store every parsed reading.
        /// </summary>
        public static void LoadGO()
        {
            GetUSA();
            List<string[]> getlist = Read(txtURL);

            // The text file is only a scratch copy of the page; remove it once parsed.
            if (File.Exists(txtURL))
            {
                File.Delete(txtURL);
            }

            // Processed from the last parsed line to the first, as the original did
            // (presumably so that older readings are inserted first - confirm against the feed order).
            for (int i = getlist.Count - 1; i > -1; i--)
            {
                string[] fields = getlist[i];

                // Lines that matched "title" but do not carry all expected fields are not readings.
                if (fields.Length < RequiredFieldCount)
                {
                    continue;
                }

                SaveReading(fields);
            }
        }

        /// <summary>
        /// Computes the daily average for one parsed reading and inserts the reading
        /// unless a row with the same timestamp already exists.
        /// </summary>
        /// <param name="fields">
        /// Parsed fields of one line: [0] date/time, [2] floating-point value,
        /// [3] integer value, [4] textual description. Field [1] is not used.
        /// </param>
        private static void SaveReading(string[] fields)
        {
            // The feed is machine-generated text, so it is parsed culture-independently.
            DateTime dtime = DateTime.Parse(fields[0], CultureInfo.InvariantCulture);
            string getTime = dtime.ToString("yyyy-MM-dd HH:mm", CultureInfo.InvariantCulture);
            string controlTime = dtime.ToString("yyyy-MM-dd", CultureInfo.InvariantCulture);
            float latestHourdata1 = float.Parse(fields[2], CultureInfo.InvariantCulture);
            int latestHourdata2 = int.Parse(fields[3], CultureInfo.InvariantCulture);

            float avgdata1 = 0;
            int avgdata2 = 0;
            string avgdata3 = fields[4];

            List<SqlParameter> listWhere = new List<SqlParameter>();
            listWhere.Add(new SqlParameter("@strDatetime", controlTime));

            // Query the day's totals; they are used to compute the daily average.
            DataTable sumDT = ControlDB(DailySumSql, listWhere, "select");
            if (sumDT != null)
            {
                foreach (DataRow itemDR in sumDT.Rows)
                {
                    // Number of rows already stored for this date.
                    int allcount = Convert.ToInt32(itemDR["allcount"], CultureInfo.InvariantCulture);
                    if (allcount > 0)
                    {
                        // SUM yields NULL when every value is NULL; that arrives as DBNull, never as null.
                        if (!itemDR.IsNull("LatestHourdata1_avg"))
                        {
                            float storedSum1 = Convert.ToSingle(itemDR["LatestHourdata1_avg"], CultureInfo.InvariantCulture);
                            // (stored sum + newest value) / (stored count + 1) = daily average
                            avgdata1 = (storedSum1 + latestHourdata1) / (allcount + 1);
                        }

                        if (!itemDR.IsNull("LatestHourdata2_avg"))
                        {
                            int storedSum2 = Convert.ToInt32(itemDR["LatestHourdata2_avg"], CultureInfo.InvariantCulture);
                            // Integer division, as in the original calculation.
                            avgdata2 = (storedSum2 + latestHourdata2) / (allcount + 1);
                        }

                        // Classify the average according to the website's rules.
                        avgdata3 = DescribeAverage(avgdata2);
                    }
                    else
                    {
                        // First reading of the day: the average is the reading itself.
                        avgdata1 = latestHourdata1;
                        avgdata2 = latestHourdata2;
                    }
                }
            }

            List<SqlParameter> pars = new List<SqlParameter>();
            pars.Add(new SqlParameter("@whereDatetime", getTime));
            pars.Add(new SqlParameter("@datetime", getTime));
            pars.Add(new SqlParameter("@LatestHourdata1", latestHourdata1));
            pars.Add(new SqlParameter("@LatestHourdata2", latestHourdata2));
            pars.Add(new SqlParameter("@LatestHourdata3", fields[4]));
            pars.Add(new SqlParameter("@Avgdata1", avgdata1));
            pars.Add(new SqlParameter("@Avgdata2", avgdata2));
            pars.Add(new SqlParameter("@Avgdata3", avgdata3));

            ControlDB(InsertIfMissingSql, pars, ""); // insert the reading
        }

        /// <summary>
        /// Maps an averaged integer value to the severity text stored in <c>AvgData3</c>.
        /// </summary>
        /// <param name="average">Averaged value to classify.</param>
        /// <returns>The severity text; values below 0 or above 300 map to "Hazardous".</returns>
        private static string DescribeAverage(int average)
        {
            if (average < 0 || average > 300)
            {
                return " Hazardous (at 24-hour exposure at this level)";
            }

            if (average <= 50)
            {
                return " Good (at 24-hour exposure at this level)";
            }

            if (average <= 100)
            {
                return " Moderate (at 24-hour exposure at this level)";
            }

            if (average <= 150)
            {
                return " Unhealthy for Sensitive Groups (at 24-hour exposure at this level)";
            }

            if (average <= 200)
            {
                return " Unhealthy (at 24-hour exposure at this level)";
            }

            return " Very Unhealthy (at 24-hour exposure at this level)";
        }

        /// <summary>
        /// Downloads the page at <see cref="GetURL"/> and saves its text to <see cref="txtURL"/>.
        /// </summary>
        public static void GetUSA()
        {
            WebRequest request = WebRequest.Create(GetURL);

            string pageSource;
            // Dispose the response and reader even when reading fails, so the connection is released.
            using (WebResponse response = request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding("gb2312")))
            {
                // ReadToEnd returns the complete source of the page.
                pageSource = reader.ReadToEnd();
            }

            // Creates the file or overwrites an existing one, and always closes the handle.
            File.WriteAllBytes(txtURL, Encoding.Default.GetBytes(pageSource));
        }

        /// <summary>
        /// Reads the text file line by line and extracts the fields of every line containing "title".
        /// </summary>
        /// <param name="path">Path of the text file.</param>
        /// <returns>
        /// One string array per matching line: the text following the "title" marker, cut at the
        /// first '"' and then at the first '&amp;', split on ';' with empty entries removed.
        /// </returns>
        public static List<string[]> Read(string path)
        {
            // Characters skipped from the start of "title" (presumably the `title="` prefix - confirm against the page).
            const int ValueOffset = 7;

            List<string[]> list = new List<string[]>();
            using (StreamReader sr = new StreamReader(path, Encoding.Default))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    int i = line.IndexOf("title", StringComparison.Ordinal);

                    // As in the original, a match at position 0 is ignored; a line too short to
                    // hold a value after the marker is skipped instead of throwing.
                    if (i <= 0 || i + ValueOffset > line.Length)
                    {
                        continue;
                    }

                    string titleStr = line.Substring(i + ValueOffset); // text after the marker
                    string titledata = titleStr.Split('"')[0];         // up to the first '"'
                    string data = titledata.Split('&')[0];             // up to the first '&'
                    string[] datastrlist = data.Split(new char[] { ';' }, StringSplitOptions.RemoveEmptyEntries);
                    list.Add(datastrlist);
                }
            }

            return list;
        }

        /// <summary>
        /// Runs a query or a non-query statement through <c>DataAccess</c>.
        /// </summary>
        /// <param name="sql">SQL text to execute.</param>
        /// <param name="par">Parameters passed with the statement.</param>
        /// <param name="type">"select" to return the query result; any other value executes the statement.</param>
        /// <returns>The query result for "select"; otherwise an empty <see cref="DataTable"/>.</returns>
        public static DataTable ControlDB(string sql, List<SqlParameter> par, string type)
        {
            DataAccess controData = new DataAccess();
            if (type == "select")
            {
                return controData.GetDataTable(sql, par.ToArray());
            }

            controData.ExecuteSql(sql, par.ToArray());
            return new DataTable();
        }
    }
}

 Demo下载:

 http://files.cnblogs.com/files/xinchun/GetUSAData.zip

  相关解决方案