通过一个网站链接按目录结构下载并保存网站资源；也可以根据资源列表批量下载资源，并自动分目录存放

拿我格子衫来

发布于 2024-09-26 09:04:02

1050

发布于 2024-09-26 09:04:02

文章被收录于专栏：TopFE

适用于服务端渲染的页面：资源链接都包含在首次返回的 HTML 中，可以一次性解析并下载所有资源

const fs = require('fs')
const path = require('path')
const https = require('https')
const http = require('http')
const url = require('url')
const cheerio = require('cheerio')

const targetUrl = 'https://example.com' // URL of the site whose resources are downloaded
const outputDir = './downloaded_resources' // Directory the downloaded resources are stored in

/**
 * Create a directory together with any missing parent directories.
 *
 * Resource paths are usually nested several levels deep (for example
 * `assets/img/logo.png`), so the directory has to be created recursively.
 * Calling this for a directory that already exists is a no-op.
 *
 * @param {string} dirPath - Directory to create.
 * @throws {Error} If the path exists but is not a directory, or cannot be created.
 */
function createDir(dirPath) {
  // `recursive: true` builds every missing ancestor and does not fail when
  // the directory is already there, so no separate existence check is needed.
  fs.mkdirSync(dirPath, { recursive: true })
}

/**
 * Download a single resource over HTTP or HTTPS and write it to disk.
 *
 * The destination file is only created after the server answered with
 * status 200, so failed requests do not leave empty files behind. If the
 * transfer fails midway, the partial file is removed before rejecting.
 *
 * @param {string} fileUrl - Absolute `http:` or `https:` URL of the resource.
 * @param {string} filePath - Destination file path; its directory must already exist.
 * @returns {Promise<void>} Resolves once the file is fully written and closed;
 *   rejects on an invalid URL, a request error, a non-200 status or a write error.
 */
function downloadFile(fileUrl, filePath) {
  return new Promise((resolve, reject) => {
    // Parsing inside the executor turns an invalid URL into a rejection.
    const protocol = new URL(fileUrl).protocol === 'https:' ? https : http
    let file = null

    // Shared failure path: drop the partial file (if any) and then reject.
    const fail = (err) => {
      if (!file) {
        reject(err)
        return
      }
      file.destroy()
      // An unlink failure (e.g. file already gone) is irrelevant here;
      // the original error is the one worth reporting.
      fs.unlink(filePath, () => reject(err))
    }

    protocol
      .get(fileUrl, (response) => {
        if (response.statusCode !== 200) {
          // Drain the body so the socket is released.
          response.resume()
          reject(new Error(`Failed to download ${fileUrl}, status code: ${response.statusCode}`))
          return
        }

        file = fs.createWriteStream(filePath)

        file.on('finish', () => {
          file.close(() => resolve())
        })
        file.on('error', fail)
        response.on('error', fail)

        response.pipe(file)
      })
      .on('error', fail)
  })
}

/**
 * Fetch a page, collect the resources it references and save each one under
 * `outputDir`, using the resource's URL path as the relative file path.
 *
 * Resources are taken from the `src`, `href` or `data` attribute of
 * img, script, stylesheet link, audio, video, source and object elements.
 * Downloads run one after another; a failing resource is logged and does
 * not stop the remaining ones. Errors are logged, never thrown.
 *
 * @param {URL} url - Page address as a WHATWG `URL` object (its `protocol` is read).
 * @param {string} outputDir - Root directory for the downloaded files.
 * @returns {Promise<void>} Resolves after every resource has been attempted.
 */
async function downloadResources(url, outputDir) {
  try {
    const html = await new Promise((resolve, reject) => {
      const protocol = url.protocol === 'https:' ? https : http
      protocol
        .get(url, (res) => {
          if (res.statusCode !== 200) {
            // Redirects and error pages carry no useful markup; drain and fail loudly.
            res.resume()
            reject(new Error(`Failed to fetch ${url}, status code: ${res.statusCode}`))
            return
          }
          res.setEncoding('utf8')
          let rawData = ''
          res.on('data', (chunk) => {
            rawData += chunk
          })
          res.on('end', () => {
            resolve(rawData)
          })
          res.on('error', reject)
        })
        .on('error', reject)
    })

    const $ = cheerio.load(html)

    // cheerio's .each() ignores promises returned by its callback, so the
    // elements are turned into a plain array and awaited in a regular loop.
    const elements = $('img, script, link[rel="stylesheet"], audio, video, source, object').toArray()

    const seen = new Set()
    let failedCount = 0

    for (const element of elements) {
      const node = $(element)
      const resourceUrl = node.attr('src') || node.attr('href') || node.attr('data')
      if (!resourceUrl) continue

      try {
        const parsedUrl = new URL(resourceUrl, url)

        // data:, blob: and similar URLs cannot be fetched over HTTP(S).
        if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') continue

        // The same file is often referenced several times; fetch it once.
        if (seen.has(parsedUrl.href)) continue
        seen.add(parsedUrl.href)

        let relativePath = parsedUrl.pathname.slice(1)
        // A path ending in "/" names a directory, so give the file a name.
        if (relativePath === '' || relativePath.endsWith('/')) relativePath += 'index.html'

        const filePath = path.join(outputDir, relativePath)
        createDir(path.dirname(filePath))

        console.log(`Downloading ${resourceUrl} to ${filePath}...`)
        await downloadFile(parsedUrl.href, filePath)
      } catch (error) {
        failedCount += 1
        console.error(`Failed to download ${resourceUrl}:`, error)
      }
    }

    if (failedCount === 0) {
      console.log(`All resources downloaded to ${outputDir}!`)
    } else {
      console.log(`Finished with ${failedCount} failed download(s); see errors above.`)
    }
  } catch (error) {
    console.error(error)
  }
}

// Entry point: fetch the page at targetUrl and save its resources under outputDir
downloadResources(new URL(targetUrl), outputDir)

手动收集资源链接，批量下载到对应的目录

手动获取所有的资源

在浏览器控制台执行下面的脚本：它会收集当前页面的所有资源链接，并自动下载为 resource-links.txt 文件

;(() => {
  // Matches every url(...) token inside a CSS value; group 1 is the bare URL.
  const CSS_URL_PATTERN = /url\(["']?([^"')]+)["']?\)/g

  /**
   * Resolve `rawUrl` against `baseUrl` and store the absolute form in `links`.
   * Empty values, unparseable URLs and blob:/data: URLs are skipped, because
   * blob: and data: URLs cannot be fetched from outside this page.
   */
  const addLink = (links, rawUrl, baseUrl = document.baseURI) => {
    if (!rawUrl) return

    let absoluteUrl
    try {
      absoluteUrl = new URL(rawUrl, baseUrl).href
    } catch (error) {
      return // not a usable URL
    }

    if (absoluteUrl.startsWith('blob:') || absoluteUrl.startsWith('data:')) return
    links.add(absoluteUrl)
  }

  /** Add every url(...) found in a CSS value, resolved against `baseUrl`. */
  const addCssUrls = (links, cssValue, baseUrl) => {
    for (const match of cssValue.matchAll(CSS_URL_PATTERN)) {
      addLink(links, match[1], baseUrl)
    }
  }

  /** Collect the resource links of the current page as an array of absolute URLs. */
  const getResourceLinks = () => {
    const links = new Set()

    // Images
    document.querySelectorAll('img').forEach((img) => addLink(links, img.src))

    // Video and audio: the element's own src as well as its <source> children
    document.querySelectorAll('video, audio').forEach((media) => {
      addLink(links, media.src)
      addLink(links, media.poster) // only defined on <video>
      media.querySelectorAll('source').forEach((source) => addLink(links, source.src))
    })

    // Stylesheets (CSS)
    document.querySelectorAll('link[rel="stylesheet"]').forEach((link) => addLink(links, link.href))

    // Fonts declared through @font-face rules
    for (const sheet of document.styleSheets) {
      try {
        const rules = sheet.cssRules || sheet.rules
        for (const rule of rules) {
          if (rule.style && rule.style.src) {
            // Relative URLs in a stylesheet are relative to that stylesheet,
            // not to the page; inline <style> sheets have no href.
            addCssUrls(links, rule.style.src, sheet.href || document.baseURI)
          }
        }
      } catch (error) {
        // Rules of cross-origin stylesheets are not readable; skip them.
      }
    }

    // Scripts
    document.querySelectorAll('script').forEach((script) => addLink(links, script.src))

    // Background images; a single element may declare several layers
    document.querySelectorAll('*').forEach((el) => {
      const bgImage = getComputedStyle(el).backgroundImage
      if (bgImage && bgImage !== 'none') addCssUrls(links, bgImage, document.baseURI)
    })

    return Array.from(links)
  }

  /** Offer `text` to the user as a downloaded file named `fileName`. */
  const downloadTextFile = (text, fileName) => {
    const blob = new Blob([text], { type: 'text/plain' })
    const objectUrl = URL.createObjectURL(blob)
    const a = document.createElement('a')
    a.href = objectUrl
    a.download = fileName
    document.body.appendChild(a)
    a.click()
    document.body.removeChild(a)
    // Release the blob once the click has been dispatched.
    setTimeout(() => URL.revokeObjectURL(objectUrl), 0)
  }

  // Collect the links and show them in the console
  const resources = getResourceLinks()
  console.log('资源链接:', resources)

  // Serialize the links as a ready-to-paste `urls` declaration and download it
  const fileContent = JSON.stringify(resources)
  downloadTextFile(`const urls = ${fileContent}`, 'resource-links.txt')
})()

获取所有链接资源下载到对应的目录中

将上一步导出的 resource-links.txt 中的 urls 数组替换到下面脚本的 urls 变量中，然后用 Node.js 执行该脚本

const fs = require('fs')
const http = require('http')
const https = require('https')
const path = require('path')
const { URL } = require('url')

/**
 * Make sure the directory that will contain `filePath` exists.
 *
 * @param {string} filePath - Path of a file; only its parent directory is created.
 */
const ensureDirectoryExistence = (filePath) => {
  const parentDir = path.dirname(filePath)
  if (fs.existsSync(parentDir)) return

  // Missing: build it together with any missing ancestors.
  fs.mkdirSync(parentDir, { recursive: true })
}

/**
 * Download a URL to a local file, following HTTP redirects.
 *
 * Both `http:` and `https:` URLs are supported. Redirect targets are
 * resolved against the current URL, so relative `Location` headers work,
 * and the number of hops is capped to avoid redirect loops. A partially
 * written file is deleted when the transfer fails.
 *
 * @param {string} url - Absolute URL of the resource.
 * @param {string} dest - Destination file path; missing directories are created.
 * @param {number} [redirectsLeft=5] - How many more redirects may be followed.
 * @returns {Promise<string>} Resolves with `dest` once the file is written and closed;
 *   rejects on an invalid URL, a request error, a non-200 status,
 *   too many redirects or a file system error.
 */
const downloadFile = (url, dest, redirectsLeft = 5) => {
  return new Promise((resolve, reject) => {
    // Parsed inside the executor so an invalid URL becomes a rejection.
    const client = new URL(url).protocol === 'http:' ? http : https

    const request = client.get(url, (response) => {
      const { statusCode, headers } = response

      // Follow redirects
      if (statusCode >= 300 && statusCode < 400 && headers.location) {
        // The redirect body is not needed; drain it to free the socket.
        response.resume()

        if (redirectsLeft <= 0) {
          return reject(new Error(`Too many redirects while getting '${url}'`))
        }

        let nextUrl
        try {
          // `Location` may be relative to the URL that issued the redirect.
          nextUrl = new URL(headers.location, url).href
        } catch (err) {
          return reject(err)
        }
        return downloadFile(nextUrl, dest, redirectsLeft - 1).then(resolve, reject)
      }

      // Anything other than 200 is treated as a failure
      if (statusCode !== 200) {
        response.resume()
        return reject(new Error(`Failed to get '${url}' (${statusCode})`))
      }

      let file
      try {
        // Exceptions thrown in this callback would bypass the promise,
        // so directory and stream setup errors are turned into rejections.
        ensureDirectoryExistence(dest)
        file = fs.createWriteStream(dest)
      } catch (err) {
        response.resume()
        return reject(err)
      }

      // Shared failure path: remove the incomplete file, then reject.
      const fail = (err) => {
        file.destroy()
        fs.unlink(dest, () => reject(err))
      }

      response.on('error', fail)
      file.on('error', fail)

      // Resolve only after the file has been flushed and closed
      file.on('finish', () => {
        file.close(() => resolve(dest))
      })

      response.pipe(file)
    })

    // Connection-level errors (DNS failure, refused connection, ...)
    request.on('error', (err) => {
      reject(err)
    })
  })
}

/**
 * Download every URL in `urls`, one at a time, into `directory`.
 *
 * Each file is stored at the path formed by joining `directory` with the
 * URL's pathname. A failure is logged and the loop moves on to the next URL.
 *
 * @param {Iterable<string>} urls - Absolute resource URLs.
 * @param {string} directory - Root directory for the downloaded files.
 * @returns {Promise<void>} Resolves after every URL has been attempted.
 */
const downloadResources = async (urls, directory) => {
  for (const resourceUrl of urls) {
    try {
      const { pathname } = new URL(resourceUrl)
      const savedTo = path.join(directory, pathname)

      await downloadFile(resourceUrl, savedTo)
      console.log(`Downloaded: ${resourceUrl} -> ${savedTo}`)
    } catch (downloadError) {
      console.error(`Failed to download ${resourceUrl}:`, downloadError)
    }
  }
}

// Sample target directory and resource URLs
const targetDirectory = './test'
const urls = [
  'https://profile-avatar.csdnimg.cn/a543dcdeef584c9f855695e5a65600ea_github_35631540.jpg',
  'https://i-blog.csdnimg.cn/direct/6d71ec4ddf7c47eca0fee08caec7bcd5.png'
]

// Outcome handlers for the whole batch
const reportCompletion = () => {
  console.log('All downloads completed.')
}
const reportFailure = (error) => {
  console.error('Error downloading resources:', error)
}

// Start downloading
downloadResources(urls, targetDirectory).then(reportCompletion).catch(reportFailure)

本文参与腾讯云自媒体同步曝光计划，分享自作者个人站点/博客。

原始发表：2024-09-20，如有侵权请联系 cloudcommunity@tencent.com 删除

函数