what is going on

This commit is contained in:
Frank Delaguila
2022-10-04 20:46:30 -06:00
commit 001e362d38
14 changed files with 8399 additions and 0 deletions

136
scraper.js Normal file
View File

@@ -0,0 +1,136 @@
const puppeteer = require('puppeteer-extra');
const { writeFile } = require('fs').promises;
/*************************************
* THIS IS WHAT MAKES IT WORK
* SHOUT OUT JORDAN HANSEN - https://cobaltintelligence.com/blog/avoid-being-blocked-with-puppeteer/
* ANOTHER WAY TO BLOCK ADS IS TO INTERCEPT EVERY REQUEST AND ABORT THOSE WHOSE DOMAIN APPEARS IN THIS HOSTS LIST:
* https://winhelp2002.mvps.org/hosts.txt
* Example Code:
* //now we read the host file
var hostFile = fs.readFileSync('hosts.txt', 'utf8').split('\n');
var hosts = {};
for (var i = 0; i < hostFile.length; i++) {
var frags = hostFile[i].split(' ');
if (frags.length > 1 && frags[0] === '0.0.0.0') {
hosts[frags[1].trim()] = true;
}
}
* page.on('request', request => {
var domain = null;
if (task.input.blockads) {
var frags = request.url().split('/');
if (frags.length > 2) {
domain = frags[2];
}
}
if ((task.input.blockads && hosts[domain] === true) || (!task.input.includephotos && request.resourceType() === 'image')) {
request.abort();
}
else {
request.continue();
}
});
*************************************/
// const StealthPlugin = require('puppeteer-extra-plugin-stealth');
// puppeteer.use(StealthPlugin());
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker');
/************************************/
/**
 * Reads the passing totals from a season statistics page.
 *
 * Executed inside the browser page via `page.evaluate`, which serialises the
 * function, so it must not reference anything from the surrounding Node scope.
 *
 * Both values come from the first data row (`data-row="0"`) of the page's
 * `.stats_table`.
 *
 * @returns {{passCompletions: number, passAttempts: number}}
 * @throws {Error} If either cell cannot be found in the document.
 */
function readPassingStats() {
  const readCell = (stat) => {
    const selector = `.stats_table tr[data-row="0"] td[data-stat="${stat}"]`;
    const cell = document.querySelector(selector);
    if (cell === null) {
      // Fail with the selector in the message instead of an opaque
      // "cannot read properties of null" from inside the page.
      throw new Error(`Passing stat cell not found: ${selector}`);
    }
    return Number(cell.textContent);
  };

  return {
    passCompletions: readCell('pass_cmp'),
    passAttempts: readCell('pass_att'),
  };
}

/**
 * Reads every game listed in the `#schedule` table.
 *
 * Executed inside the browser page via `page.evaluate`, which serialises the
 * function, so it must not reference anything from the surrounding Node scope.
 *
 * Example result:
 *   { games: [ { opponent: 'florida', date: 'Sep 3, 2022', utah_score: 26, opponent_points: 29 }, ... ] }
 *
 * @returns {{games: Array<{opponent: string, date: string, utah_score: number, opponent_points: number}>}}
 */
function readSchedule() {
  const cellsFor = (stat) => document.querySelectorAll(`#schedule td[data-stat="${stat}"]`);

  // Four parallel NodeLists; the same index in each is treated as one game.
  const dates = cellsFor('date_game');
  const opponents = cellsFor('opp_name');
  const utahScores = cellsFor('points');
  const opponentPoints = cellsFor('opp_points');

  // A blank score cell is recorded as 0 (presumably a game not yet played).
  const toScore = (cell) => (cell.textContent !== '' ? Number(cell.textContent) : 0);

  const games = [];
  dates.forEach((date, index) => {
    // The regex strips parentheses, '[', digits, '^' and ALL whitespace, so a
    // prefix such as "(7) " is dropped and multi-word names are joined together.
    const opponent = opponents[index].textContent
      .replace(/([([)0-9^\s])/g, '')
      .toLowerCase()
      .trim();

    games.push({
      opponent,
      date: date.textContent,
      utah_score: toScore(utahScores[index]),
      opponent_points: toScore(opponentPoints[index]),
    });
  });

  return { games };
}

/**
 * Script entry point: launches a visible browser with the ad blocker enabled,
 * scrapes the Utah 2022 schedule plus the 2021 and 2022 passing totals, prints
 * the combined data as JSON and writes it to `db.js`.
 */
puppeteer
  .use(AdblockerPlugin({ blockTrackers: true }))
  .launch({ headless: false })
  .then(async (browser) => {
    try {
      const schoolUrl = 'https://www.sports-reference.com/cfb/schools/utah';

      // Opens a new tab and waits for the navigation to finish.
      const openPage = async (url) => {
        const page = await browser.newPage();
        await page.goto(url);
        return page;
      };

      // Pages are loaded one after another, in the same order as before.
      const schedulePage = await openPage(`${schoolUrl}/2022-schedule.html`);
      const previousYearStatisticsPage = await openPage(`${schoolUrl}/2021.html`);
      const currentYearStatisticsPage = await openPage(`${schoolUrl}/2022.html`);

      // Build the data for each page.
      const previousYearStatistics = await previousYearStatisticsPage.evaluate(readPassingStats);
      const currentYearStatistics = await currentYearStatisticsPage.evaluate(readPassingStats);
      const scheduleResults = await schedulePage.evaluate(readSchedule);

      // Attach the per-season passing totals under their year keys.
      scheduleResults[2021] = previousYearStatistics;
      scheduleResults[2022] = currentYearStatistics;

      const data = JSON.stringify(scheduleResults, null, 2);
      console.log(data);

      // NOTE: the output is a .js file (not .json) so it can be used locally
      // for the charts. An alternative would be an Express server that serves
      // this data from an API endpoint.
      // Awaited so a failed write reaches the .catch below instead of becoming
      // an unhandled rejection, and so the browser is only closed afterwards.
      await writeFile('db.js', `const dbData = ${data}`, 'utf8');
    } finally {
      // Always close the browser, even when scraping fails; otherwise the
      // visible browser window stays open and keeps the process alive.
      await browser.close();
    }
  })
  .catch((err) => {
    console.error(err);
    // Signal the failure to the caller/shell instead of exiting with 0.
    process.exitCode = 1;
  });