当前位置: 代码迷 >> HTML/CSS >> 用HtmlUnit跟httpClient抓施华洛世奇网站图片和动画<二>
  详细解决方案

用HtmlUnit跟httpClient抓施华洛世奇网站图片和动画<二>

热度:173   发布时间:2012-12-20 09:53:21.0
用HtmlUnit和httpClient抓施华洛世奇网站图片和动画<二>

try
????{
?????mainPage = webClient.getPage(url);
????} catch (Exception e)
????{
?????log.error(e.getMessage(), e);
????}
????if (mainPage != null)
????{
?????HtmlElement paginationContainer = mainPage
???????.getElementById("paginationContainer");
?????if (paginationContainer != null)
?????{
??????url = url + "/all-1";
??????try
??????{
???????mainPage = webClient.getPage(url);
??????} catch (Exception e)
??????{
???????log.error(e.getMessage(), e);
??????}
?????}
HtmlElement categories = mainPage.getElementById("categories");
?????if (categories == null)
?????{
??????HtmlElement products = mainPage.getElementById("products");
??????if(products!=null)
??????{
???????List<HtmlElement> productDivList = products.getElementsByTagName("div");
???????if(productDivList!=null&&productDivList.size()>0)
???????{
????????for(HtmlElement proDiv:productDivList)
????????{
?????????List<HtmlElement> subList = proDiv.getElementsByTagName("div");
?????????if(subList!=null)
?????????{
??????????for(HtmlElement dt:subList)
??????????{
???????????String classt = dt.getAttribute("class");
???????????if (classt != null && classt.equals("productImg"))
???????????{
????????????List<HtmlElement> subAList = proDiv.getElementsByTagName("a");
????????????if(subAList!=null&&subAList.size()>0)
????????????{
?????????????HtmlElement ae=subAList.get(0);
?????????????String href = ae.getAttribute("href");
?????????????thirdPageUrls.add(href);
????????????}
????????????break;
???????????}
??????????}
?????????}
????????}
???????}
??????}
?????}
?????else
?????{
????? List<HtmlElement> divList = categories.getElementsByTagName("div");
???????? if (divList != null && divList.size() > 0)
???????? {
???????for (HtmlElement div : divList)
???????{
????????List<HtmlElement> tempDivs = div
??????????.getElementsByTagName("div");
????????if (tempDivs != null && tempDivs.size() > 0)
????????{
?????????for (HtmlElement div1 : tempDivs)
?????????{
??????????List<HtmlElement> aList = div1
????????????.getElementsByTagName("a");
??????????HtmlAnchor a = (HtmlAnchor) aList
????????????.get(0);
??????????String link = a.getHrefAttribute();
??????????secondPageUrls.add(link);

?????????}

????????}
???????}
??????}
?????}
????

????}
???}
??}
??log.error("第二层抓取结束..........");
??log.error("目前抓取到的第二层URL个数为:"+secondPageUrls.size());

?

int count=0;
??
??for (String url : secondPageUrls)
??{
???count++;
???log.error("正在抓取第二层的第"+count+"个URL:"+url);
???HtmlPage mainPage = null;
???try
???{
????mainPage = webClient.getPage(url);
???} catch (Exception e)
???{
????log.error(e.getMessage(), e);
???}
???log.error("抓取URL完成:"+url+",正在分析URL"+url+"+结果的URL");
???if (mainPage != null)
???{
????HtmlElement paginationContainer = mainPage
??????.getElementById("paginationContainer");
????if (paginationContainer != null)
????{
?????url = url + "/all-1";
????}
????HtmlElement products = mainPage.getElementById("products");
????if (products != null)
????{
?????List<HtmlElement> list = products
???????.getElementsByTagName("div");
?????;
?????if (list == null || list.size() == 0)
?????{
??????continue;
?????}
?????for (HtmlElement h : list)
?????{
??????String cls = h.getAttribute("class");
??????if (cls == null || !cls.equals("productName"))
??????{
???????continue;
??????}
??????List<HtmlElement> links = h.getElementsByTagName("a");
??????if (links != null && links.size() > 0)
??????{
???????HtmlAnchor htmlAnchor = (HtmlAnchor) links.get(0);
???????String linkStr = htmlAnchor.getHrefAttribute();
???????thirdPageUrls.add(linkStr);
???????log.error(linkStr);
??????}

?????}
????}

???}
??}
??log.error("第二层抓取结束..........");
??secondPageUrls.clear();
??secondPageUrls = null;
??
?? count=0;
??log.error("目前抓取到的第三层URL个数为:"+thirdPageUrls.size());
??String urlPrix="
http://www.swarovski-crystallized.com/jewelry/us/";
?for (String url : thirdPageUrls)
??{
???count++;
???log.error("正在抓取第三层的第"+count+"个URL:"+url);
???HtmlPage mainPage = null;
???try
???{
????mainPage = webClient.getPage(url);
???} catch (Exception e)
???{
????log.error(e.getMessage(), e);
???}
???if (mainPage != null)
???{
????log.error("抓取URL完成:"+url+",正在分析URL"+url+"+结果");
????int indexC=url.indexOf(urlPrix);
????int indexD=url.indexOf("?");
????String dirStr=url.substring(indexC+urlPrix.length(), indexD);
????String regEx = "/";
//????Pattern p = Pattern.compile(regEx);
//????Matcher m = p.matcher(dirStr);
????

//哦哦哦,建立文件夹准备把抓到数据放在里面
????dirStr=replece( regEx,"\\\\",dirStr);
????dirStr = "D:\\swaroski\\"+dirStr;
????File?? file?? =?? new?? File(dirStr);??
????if(file.isDirectory())
????{
?????dirStr=dirStr+"\\"+count;
?????file?? =?? new?? File(dirStr);?
????}

?file.mkdirs();

Product product=new Product();
????product.setLocalDir(dirStr);
????
????product.setPageUrl(url);
????HtmlElement rightCol = mainPage.getElementById("rightCol");
????String title = null;
????String description = null;
????String packingUnit = null;
????if (rightCol != null)
????{
?????HtmlElement headlineDiv = rightCol
???????.getElementById("headline");
?????if (headlineDiv != null)

// ... (part of the code is omitted here in the original post) ...

log.error("完成:"+url+",分析结果");
?????try
?????{
??????swaroSkiDAO.addProduct(product);
?????} catch (Exception e)
?????{
??????log.error(e.getMessage(), e);
?????}
?????log.error("完成保存结果");
?????
?????for(String downloadUrl:resourceUrlList)
?????{
??????int index6=downloadUrl.lastIndexOf("/");
??????String fileName=downloadUrl.substring(index6+1);
??????String dirStr2=dirStr+"\\"+fileName;
???????? File storeFile = new File(fileName);
???????? if(storeFile.exists())
???????? {
????????? continue;
???????? }
??????SaveFileThread runable=new SaveFileThread(dirStr2,downloadUrl,sem);
??????pools.submit(runable);
??????log.error("开始提交下载文件:"+downloadUrl);
??????try
??????{
???????Thread.sleep(2500);
??????} catch (InterruptedException e)
??????{
??????}

}

?

?

// Worker task that downloads one image or animation file and stores it on disk.

class SaveFileThread implements Runnable
?{
??private String fileName;
??private String downloadUrl;
??private Semaphore sem;
????????? public SaveFileThread(String fileName,String downloadUrl,Semaphore sem)
????????? {
??????? ?? this.fileName=fileName;
??????? ?? this.downloadUrl=downloadUrl;
??????? ?? this.sem=sem;
????????? }
??public void run()
??{
??????????? HttpClient client = new HttpClient();?
?????????? GetMethod get = new GetMethod(downloadUrl);?
?????????? FileOutputStream output=null;
?????????? try
?????{
??????? client.executeMethod(get);
???????? File storeFile = new File(fileName);?
???????????? output = new FileOutputStream(storeFile);?
???????????? output.write(get.getResponseBody());
???????????? output.flush();
?????} catch (Exception e)
?????{
??????log.error(e.getMessage(), e);
?????}

  相关解决方案