How do I add a server-side delay to a for loop in JavaScript?

I'm tinkering with scraping data from an e-commerce site using Node.js. I use Request to retrieve the page's DOM and Cheerio to do server-side DOM selection. How do I add a server-side delay to the for loop in JavaScript?

const cheerio = require('cheerio');
const request = require('request');

// takes a URL, scrapes the page, and returns an object with the data
let scrapePage = (url) => {
    return new Promise((resolve, reject) => {
        request(url, (error, resp, body) => {
            if (error) {
                reject(error);
            }
            let $ = cheerio.load(body);
            let $url = url;
            let $price = $('#rt-mainbody > div > div.details > div.Data > div:nth-child(4) > div.description').text();
            let obj = {
                url: $url,
                price: $price
            }
            resolve(obj);
        });
    });
};

// Runs scrapePage in a loop
// There is a variable called arrayOfURLs defined elsewhere that contains 100s of URLs
for (let i = 0; i < arrayOfURLs.length; i++) {
    scrapePage(arrayOfURLs[i])
        .then((obj) => {
            // write to a file
        })
        .catch((error) => {
        });
}

The problem is that the server I'm sending requests to sometimes sends back blank data, I'm assuming because I'm firing off too many requests without any kind of pause. Due to the async nature of JS, I'm having a hard time figuring out how to add an effective delay between each iteration of the loop. It's not enough to just add a setTimeout in a synchronous fashion, because setTimeout itself is async, and I'm running this on the server, so there's no Window object.
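In fact, setTimeout is available as a global in Node.js, no Window object required, and wrapping it in a Promise produces an awaitable pause. A minimal sketch, reusing scrapePage and arrayOfURLs from above (the scrapeSequentially wrapper and the 1-second interval are made up for illustration):

// setTimeout is a Node.js global; wrapping it in a Promise makes it awaitable
const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

// Hypothetical wrapper: process one URL at a time with a 1-second pause between them
async function scrapeSequentially() {
    for (const url of arrayOfURLs) {
        await scrapePage(url).catch((error) => console.log(error));
        await delay(1000);
    }
}

scrapeSequentially();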

EDIT

The code above is a simplified version of what I'm working with. The entire code is this:

app.js

const fs = require('fs');
const path = 'urls.txt';
const path2 = 'results.txt';
const scraper = require('./scraper');

let scrapePage = (url) => {
    scraper.scrapePage(url)
        .then((obj) => {
            // console.log('obj from the scraper with Promises was received');
            // console.log(obj);
            // console.log('writing obj to a file');
            fs.appendFile(path2, JSON.stringify(obj) + ', ', (error) => {
                if (error) {
                    console.log(error);
                } else {
                    // console.log('Successfully wrote to ' + path2);
                }
            });
        })
        .catch((error) => {
            console.log('There was an error scraping obj: ');
            console.log(error);
        });
}

fs.readFile(path, 'utf8', (err, data) => {
    if (err) {
        throw err;
    }
    var urlArray = JSON.parse(data);

    // this returns an Unexpected Identifier error
    // const results = await Promise.all(urlArray.map(scrapePage));

    // this returns an Unexpected Token Function error
    // async function scrapePages(){
    //     const results = await Promise.all(urlArray.map(scrapePage));
    // };
});

scraper.js

const request = require('request');
const cheerio = require('cheerio');

exports.scrapePage = (url) => {
    return new Promise((resolve, reject) => {
        request(url, (error, resp, body) => {
            if (error) {
                reject(error);
            }
            let $ = cheerio.load(body);
            let $url = url;
            let $price = $('#rt-mainbody > div > div.details > div.itemData > div:nth-child(4) > div.description').text();
            let obj = {
                url: $url,
                price: $price
            }
            resolve(obj);
        });
    });
}

Answer:

It looks to me like you aren't waiting for your promises to resolve before sending the server response. You could eliminate the for loop entirely by using async/await, e.g.:

const results = await Promise.all(arrayOfURLs.map(scrapePage)); 
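Note that await is only valid inside an async function, which is why the commented-out attempts in app.js above throw the "Unexpected Identifier" and "Unexpected Token" errors (async/await also requires Node 7.6 or later). A minimal sketch of the wrapper, borrowing the scrapePages name from the question's own commented-out code:

// await must live inside an async function, and the function must actually be called
async function scrapePages() {
    const results = await Promise.all(arrayOfURLs.map(scrapePage));
    // every page has resolved by this point; write results to a file, etc.
    return results;
}

scrapePages().catch((error) => console.log(error));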

Answer:

If you want no more than x active connections at a time, you can use throttle. And if you want no more than x per second, you can use throttlePeriod.

Promise.all will never call your resolve handler if even a single request fails, so you can catch any errors and return a Fail object instead:

const Fail = function (details) { this.details = details; };
const max10 = throttle(10)(scrapePage); // max 10 active connections
// const twoPerSecond = throttlePeriod(2, 1000)(scrapePage); // start no more than 2 per second

Promise.all(
    arrayOfURLs.map(
        url => max10(url).catch(err => new Fail([err, url]))
    )
)
.then(results => {
    const successes = results.filter(result => (result && result.constructor) !== Fail);
    const failed = results.filter(result => (result && result.constructor) === Fail);
});
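The throttle and throttlePeriod helpers used above are not part of JavaScript or of a standard module, so treat the names as the answerer's own utilities. A minimal sketch of what a comparable throttle(max) concurrency limiter could look like:

// Sketch of a throttle(max) helper: at most `max` wrapped calls run at once;
// further calls wait in a FIFO queue until a slot frees up.
const throttle = (max) => (fn) => {
    let active = 0;
    const queue = [];
    const next = () => {
        if (active >= max || queue.length === 0) return;
        active++;
        const { args, resolve, reject } = queue.shift();
        Promise.resolve(fn(...args))
            .then(resolve, reject)
            .then(() => { active--; next(); });
    };
    return (...args) => new Promise((resolve, reject) => {
        queue.push({ args, resolve, reject });
        next();
    });
};

With something like this in place, max10 = throttle(10)(scrapePage) behaves as the answer describes: the eleventh URL only starts once one of the first ten has settled.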

Answer:

const cheerio = require('cheerio');
const request = require('request');

let scrapePage = (url) => {
    return new Promise((resolve, reject) => {
        request(url, (error, resp, body) => {
            if (error) {
                reject(error);
                return;
            }
            if (!body) {
                reject('Empty Body');
                return;
            }
            let $ = cheerio.load(body);
            let $url = url;
            let $price = $('#rt-mainbody > div > div.details > div.Data > div:nth-child(4) > div.description').text();
            let obj = {
                url: $url,
                price: $price
            }
            resolve(obj);
        });
    });
};

function processUrl(url) {
    scrapePage(url)
        .then((obj) => {
            // write to a file
            if (arrayOfURLs.length) {
                processUrl(arrayOfURLs.pop());
            }
        })
        .catch((error) => {
            arrayOfURLs.unshift(url);
            if (arrayOfURLs.length) { // put this in a finally block
                processUrl(arrayOfURLs.pop());
            }
        });
}

processUrl(arrayOfURLs.pop());

Here we can use the arrayOfURLs array as a queue: if we get an error or a blank page, we push that URL back into the array. That way we can process each URL one at a time, in a synchronous fashion.
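If you also want the pause the original question asked about, the same queue can wait before picking up the next URL. A minimal sketch with a hypothetical processUrlWithPause variant and a 1-second gap (the Promise-wrapped setTimeout works fine in Node):

// Same queue as above, but with a 1-second pause before the next URL
function processUrlWithPause(url) {
    scrapePage(url)
        .then((obj) => {
            // write to a file
        })
        .catch(() => arrayOfURLs.unshift(url)) // failed URLs go back into the queue
        .then(() => new Promise((resolve) => setTimeout(resolve, 1000)))
        .then(() => {
            if (arrayOfURLs.length) processUrlWithPause(arrayOfURLs.pop());
        });
}

processUrlWithPause(arrayOfURLs.pop());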
