Github Stargazers 爬虫获取项目粉丝邮箱

欧阳鸿德
2023-12-01

封装请求

请求需要带上 GitHub Access Token(从环境变量 GITHUB_TOKEN 读取)

/**
 * Module dependencies.
 */
var request = require('superagent');

/**
 * GitHub access token, read from the GITHUB_TOKEN environment variable.
 * Falls back to an empty string when the variable is unset or empty.
 */
 var accessToken = process.env.GITHUB_TOKEN || '';

/**
 * Thunkified GET.
 */
exports.get = function get(uri) {
  return function(fn) {
    request
      .get(uri)
      .set('User-Agent','Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36')
      .set('Authorization', 'Bearer ' + accessToken)
      .end(fn);
  };
};

获取粉丝列表和邮箱方法

/**
 * Module dependencies.
 */
var request = require('./request');

/**
 * Get stargazers from repo.
 */
exports.getStargazers = function *(repo) {
  var page = 0;
  var stargazers = [];
  do {
    var url = 'https://api.github.com/repos/' + repo + '/stargazers?per_page=100&page=' + page;
    var res = yield request.get(url);
    var users = res.body;
    for (var i = 0; i < users.length; i++) {
      stargazers.push(users[i].login);
      // I can also get organizations and other stuff.
    }
    if (users.length < 25) break;
    page += 1;
  } while (true);
  return stargazers;
};

/**
 * Get email from github usernane.
 */
exports.getEmail = function *(username) {
  var url = 'https://api.github.com/users/' + username;
  var res = yield request.get(url);
  var user = res.body;
  return {
    username: user.login,
    email: user.email,
    hireable: user.hireable
  };
};

执行脚本

/**
 * Module dependencies.
 */
var repo = process.argv[2] || 'keycloak/keycloak';
var github = require('./lib/github');
var co = require('co');
var fs = require('fs');

/**
 * Entry point.
 *
 * 1. Fetch every stargazer login for `repo` and dump the list to
 *    stargazers.txt as pretty-printed JSON.
 * 2. Look each login up in turn and append a `username,email,hireable`
 *    row to output.csv.
 *
 * A failed lookup is logged and skipped so the rest of the list is still
 * processed; a failure while fetching or writing the list is logged and
 * ends the run.
 */
co(function *() {
  try {
    var logins = yield github.getStargazers(repo);
    var dump = JSON.stringify(logins, null, 2);
    fs.writeFileSync('stargazers.txt', dump, {encoding:'utf-8'});

    for (var login of logins) {
      try {
        var profile = yield github.getEmail(login);
        var fields = [profile.username, profile.email, profile.hireable];
        fs.appendFileSync('output.csv', fields.join(',') + '\n', { encoding: 'utf8' });
      } catch (lookupError) {
        console.error(lookupError);
      }
    }
  } catch (fatalError) {
    console.error(fatalError);
  }
});

注意

GitHub 个人 Token 每小时的请求上限为 5000 次。超限之后,等一小时或者换一个账号的 Token,然后从 stargazers.txt 中读取粉丝列表,从中断处的下标继续执行,直至跑完。

 类似资料: