问题描述
我用 PhantomJS 写了一个脚本。 它从指定的页面中抓取一些元素,这部分工作正常。
这是代码:
// PhantomJS-side state shared by the step functions and the step runner.
var page = new WebPage();
var testindex = 0;
var loadInProgress = false;
var fs = require('fs');
var i = 0;
var j = 0;
var k = 0;

// Relay console output coming from the page to the PhantomJS console.
page.onConsoleMessage = function(message) {
  console.log(message);
};

// Keep loadInProgress in sync with page loads and log each transition.
page.onLoadStarted = function() {
  loadInProgress = true;
  console.log("load started");
};
page.onLoadFinished = function() {
  loadInProgress = false;
  console.log("load finished");
};

// User-Agent string configured for this page.
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36';

// Optional toggle, left disabled:
// page.settings.javascriptEnabled = false;
// Ordered list of step functions. The setInterval runner that follows this array
// calls them one at a time, advancing only while no page load is in progress.
var steps = [
// Step 1: start loading the product listing page.
function() { //Load Page
// No callback is passed to open(), so the next line runs immediately after this call.
page.open("http://www.example.com/mobiles/");
// NOTE(review): injectJs is given a remote URL and is called without waiting for the
// load to finish. The PhantomJS docs describe injectJs as taking a local file
// (includeJs is the URL variant) — confirm. Nothing in this script uses jQuery.
page.injectJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js");
},
// Step 2: walk the product boxes and emit one CSV row per product.
function() { //Fetch Products
// Receives every value passed to window.callPhantom() by the evaluated code below
// and writes it to product-list.csv.
// NOTE(review): each fragment is written separately, so mode 'w+' presumably
// appends (the '+' flag) — confirm against the PhantomJS fs documentation.
page.onCallback = function(result) {
var fs = require('fs');
fs.write('product-list.csv', result, 'w+');
};
// The function below is handed to page.evaluate, i.e. it is executed against the
// loaded page's DOM. NOTE(review): presumably i, j and loadInProgress in here are
// page-side globals, not the PhantomJS-side variables of the same name — confirm.
page.evaluate(function() {
var arr_mainList = new Array();
var arr_innerList = new Array();
try {
// Loop bound: number of <ul> elements under the first .item_grid element;
// i is then used as the index into the .lap_thu_box elements.
for (i = 0; i < (document.getElementsByClassName("item_grid")[0].getElementsByTagName("ul").length); i++) {
arr_mainList.push(document.getElementsByClassName("lap_thu_box")[i]);
// CSV column: product title (text of the first link inside the first <h3>).
window.callPhantom(arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].textContent + ", ");
//window.callPhantom(arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href + ", ");
// Attempt to open the product's detail page and log its .item_desc text.
// According to the question text after this script, execution stops at this
// window.open call.
// NOTE(review): getElementsByClassName is called on the window object itself
// rather than on its document — confirm whether that was intended.
var myWindow = window.open(arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href);
console.log(myWindow.getElementsByClassName("item_desc")[0].textContent);
myWindow.close();
// Logs the product link when it is longer than 43 characters (the meaning of
// 43 is not evident from this file).
if (arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href.length > 43) {
var innerURL = arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href;
console.log(innerURL);
}
// CSV column: value of the first image's data-original attribute.
window.callPhantom(arr_mainList[i].getElementsByTagName("img")[0].getAttribute("data-original") + ", ");
arr_innerList.push(arr_mainList[i]);
// CSV column: the <li> texts with commas removed, separated by " | "; the last
// one is followed by ", " to close the column.
for (j = 0; j < (document.getElementsByClassName("lap_thu_box")[i].getElementsByTagName("ul")[0].getElementsByTagName("li").length); j++) {
if ((j+1) < document.getElementsByClassName("lap_thu_box")[i].getElementsByTagName("ul")[0].getElementsByTagName("li").length) {
window.callPhantom(arr_innerList[i].getElementsByTagName("li")[j].textContent.replace(/,/g, "") + " | ");
}
else {
window.callPhantom(arr_innerList[i].getElementsByTagName("li")[j].textContent.replace(/,/g, "") + ", ");
}
};
//window.callPhantom(", ");
// CSV column: text of the first .cat_price element with commas removed, followed
// by the row terminator.
window.callPhantom(arr_innerList[i].getElementsByClassName("cat_price")[0].textContent.replace(/,/g, ""));
window.callPhantom("\n");
};
// NOTE(review): assigned inside the evaluated function; presumably this does not
// reach the PhantomJS-side loadInProgress flag — confirm.
loadInProgress = true;
console.log("Successful.");
}
catch(ex) {
// Any exception thrown above ends the loop and is only logged.
console.log("Failed: " + ex);
}
});
}
];
// Step runner: every 5 seconds, run the next entry of `steps` unless a page load
// is in progress. Once no step is left, stop polling, report completion and exit.
// `interval` is declared with var so it is no longer an implicit global.
var interval = setInterval(function() {
  if (!loadInProgress && typeof steps[testindex] === "function") {
    console.log("step " + (testindex + 1));
    steps[testindex]();
    testindex++;
  }
  if (typeof steps[testindex] !== "function") {
    // Nothing left to run: clear the timer so the exit is scheduled exactly once.
    clearInterval(interval);
    setTimeout(function() {
      // Debug aid: uncomment to dump the final page markup.
      //fs.write('product-list.html', page.content, 'w');
      console.log("test complete!");
      phantom.exit();
    }, 100);
  }
}, 5000);
现在,如果我运行该程序,我能在 csv 文件中得到所有信息。 但是一旦执行到 window.open,PhantomJS 就会停止。 我知道我无法在 page.evaluate 内部打开新页面。 但是我需要获取产品描述,并将其添加到 csv 文件中,以代替产品链接。 我已经搜索了几个小时,任何帮助都将不胜感激。 注意:我的限制是必须使用 PhantomJS。
1楼
Mansoor Akram
0
已采纳
2015-08-01 13:43:39
我稍微修改了您的脚本,现在您可以实现想要的功能了。 请记住不要一次抓取太多条目,否则会遇到内存问题。 因此,如果目标网站存在分页,请为分页单独使用一个新的函数。 在此代码中,我假设您需要每个设备的描述,但您也可以用同样的方式访问其他元素。
注意:您可能知道,跨域策略不允许我们用 JavaScript / jQuery 访问 iFrame 的内容,这会是一个很大的障碍。 因此,在 cmd / 终端中执行脚本时,您必须添加以下命令行标志:
--web-security=no
// PhantomJS-side state shared by the step functions and the step runner.
var page = new WebPage();
var innerPage = new WebPage();
var testindex = 0;
var loadInProgress = false;
var fs = require('fs');
var i = 0;
var j = 0;
var k = 0;

// Relay console output coming from the page to the PhantomJS console.
page.onConsoleMessage = function(text) {
  console.log(text);
};

// Raise the loadInProgress flag when a load starts...
page.onLoadStarted = function() {
  loadInProgress = true;
  console.log("load started");
};

// ...and lower it again when the load has finished.
page.onLoadFinished = function() {
  loadInProgress = false;
  console.log("load finished");
};

// User-Agent string configured for this page.
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36';

// Optional toggle, left disabled:
//page.settings.javascriptEnabled = false;

// Important command-line flag when launching the script:
//--web-security=yes/no
// Ordered list of step functions; the step runner calls them one at a time.
//   Step 1 opens the listing page and adds one <iframe> per product so that every
//          product's detail page is loaded inside the listing page.
//   Step 2 reads each product box plus the matching iframe and emits one CSV row
//          per product through window.callPhantom -> page.onCallback.
var steps = [
  function() { //Load Page
    page.open("http://www.example.com/mobiles-apple/", function(status) {
      if (status !== "success") {
        console.log("Failed to load listing page: " + status);
        return;
      }

      // Adds the iframe container and one iframe per product (executed in the page).
      function addDetailIframes() {
        page.evaluate(function() {
          try {
            jQuery("#main1").append('<div id="inner-data_iframes"></div>');
            var productCount = document.getElementsByClassName("item_grid")[0].getElementsByTagName("ul").length;
            var boxes = document.getElementsByClassName("lap_thu_box");
            for (var i = 0; i < productCount; i++) {
              var detailUrl = boxes[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href;
              jQuery("#inner-data_iframes").append('<iframe id="myIframe' + i + '" src="' + detailUrl + '"></iframe>');
              window.document.body.scrollTop = document.body.scrollHeight;
            }
            console.log("Mission Successful.");
          }
          catch(ex) {
            console.log("Failed to add iFrame: " + ex);
          }
        });
      }

      // injectJs only loads local files, so a remote jQuery has to be fetched with
      // includeJs, which invokes its callback once the script has loaded. Skip the
      // download when the site already provides jQuery, to avoid replacing it.
      var hasJQuery = page.evaluate(function() {
        return typeof window.jQuery === "function";
      });
      if (hasJQuery) {
        addDetailIframes();
      } else {
        page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", addDetailIframes);
      }
    });
  },
  function() { //Fetch Products
    // Receives every fragment sent by window.callPhantom() below. The '+' in the
    // mode makes PhantomJS append, so the fragments accumulate in the file.
    page.onCallback = function(result) {
      var fs = require('fs');
      fs.write('product-list.csv', result, 'w+');
    };
    // Executed in the page context: only page globals are visible in here.
    page.evaluate(function() {
      // Removes commas so a value cannot spill into extra CSV columns.
      function stripCommas(text) {
        return text.replace(/,/g, "");
      }
      try {
        var productCount = document.getElementsByClassName("item_grid")[0].getElementsByTagName("ul").length;
        var boxes = document.getElementsByClassName("lap_thu_box");
        for (var i = 0; i < productCount; i++) {
          var box = boxes[i];
          // Column 1: product title.
          window.callPhantom(box.getElementsByTagName("h3")[0].getElementsByTagName("a")[0].textContent + ", ");
          // Column 2: description taken from the product's iframe. An iframe that
          // has not loaded (or has no .item_desc) yields an empty column instead
          // of aborting the whole export.
          var desc = jQuery("#myIframe" + i).contents().find(".item_desc").html() || "";
          window.callPhantom(stripCommas(desc) + ", ");
          // Column 3: image URL kept in the data-original attribute.
          window.callPhantom(box.getElementsByTagName("img")[0].getAttribute("data-original") + ", ");
          // Column 4: list items joined with " | ".
          var specs = box.getElementsByTagName("li");
          var specCount = box.getElementsByTagName("ul")[0].getElementsByTagName("li").length;
          for (var j = 0; j < specCount; j++) {
            var separator = (j + 1) < specCount ? " | " : ", ";
            window.callPhantom(stripCommas(specs[j].textContent) + separator);
          }
          // Column 5: price, then the row terminator.
          window.callPhantom(stripCommas(box.getElementsByClassName("cat_price")[0].textContent));
          window.callPhantom("\n");
        }
        console.log("Successful.");
      }
      catch(ex) {
        console.log("Failed: " + ex);
      }
    });
  }
];
// Step runner: every 5 seconds, run the next entry of `steps` unless a page load
// is in progress. Once no step is left, stop polling, report completion and exit.
// `interval` is declared with var so it is no longer an implicit global.
var interval = setInterval(function() {
  if (!loadInProgress && typeof steps[testindex] === "function") {
    console.log("step " + (testindex + 1));
    steps[testindex]();
    testindex++;
  }
  if (typeof steps[testindex] !== "function") {
    // Nothing left to run: clear the timer so the exit is scheduled exactly once.
    clearInterval(interval);
    setTimeout(function() {
      // Debug aid: uncomment to dump the final page markup.
      //fs.write('product-list.html', page.content, 'w');
      console.log("test complete!");
      phantom.exit();
    }, 100);
  }
}, 5000);