parse_metadata.js 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. const fs = require("fs-extra");
  2. const path = require("path");
  3. const axios = require("axios");
  4. const { imageSizeFromFile } = require("image-size/fromFile");
  5. // 配置路径
  6. const OUTPUT_FILE = "image_metadata.json";
  7. const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  8. /**
  9. * 处理 Adobe Stock 图片 (保留网络请求以获取 tags)
  10. */
  11. async function processAdobeStock(imageId, html, finalUrl) {
  12. // 1. 清理 URL 中的地区代码 (如 /hk/)
  13. const cleanUrl = finalUrl.replace(
  14. /stock\.adobe\.com\/[a-z]{2}\//i,
  15. "stock.adobe.com/",
  16. );
  17. // 2. 提取 Keywords
  18. let tags = [];
  19. const keywordsRegex = /"keywords":\s*(\[.*?\])/;
  20. const match = html.match(keywordsRegex);
  21. if (match && match[1]) {
  22. try {
  23. tags = JSON.parse(match[1]);
  24. } catch (e) {
  25. console.error(`解析 Adobe Keywords JSON 失败: ${imageId}`);
  26. }
  27. }
  28. // 3. 数据清洗并取前 5 个
  29. tags = [...new Set(tags.map((t) => t.trim()).filter((t) => t))].slice(0, 5);
  30. return { from: cleanUrl, tags: tags };
  31. }
  32. /**
  33. * 主解析函数
  34. */
  35. async function parseImages(imageDir) {
  36. if (!imageDir) return;
  37. try {
  38. const files = await fs.readdir(imageDir);
  39. const results = [];
  40. // 支持 jpeg/jpg
  41. const imageFiles = files.filter((f) => /\.(jpe?g)$/i.test(f));
  42. console.log(`找到 ${imageFiles.length} 张图片,准备开始解析...`);
  43. for (const filename of imageFiles) {
  44. let imageId, source;
  45. // 1. 判定来源
  46. if (filename.startsWith("AdobeStock_")) {
  47. imageId = filename.match(/AdobeStock_(\d+)/)?.[1];
  48. source = "AdobeStock";
  49. } else if (filename.startsWith("shutterstock_")) {
  50. imageId = filename.match(/shutterstock_(\d+)/)?.[1];
  51. source = "Shutterstock";
  52. }
  53. if (!imageId) {
  54. console.warn(`跳过未知格式文件: ${filename}`);
  55. continue;
  56. }
  57. const filePath = path.join(imageDir, filename);
  58. console.log(`正在处理 [${source}] ID: ${imageId} ...`);
  59. try {
  60. // 2. 获取本地图片尺寸 (无论哪种来源都需要)
  61. const dimensions = await imageSizeFromFile(filePath);
  62. // 3. 分支处理
  63. if (source === "Shutterstock") {
  64. // --- 改进点:Shutterstock 直接静态生成数据,不发请求 ---
  65. results.push({
  66. id: imageId,
  67. source: source,
  68. filename: filename,
  69. localPath: filePath,
  70. from: `https://www.shutterstock.com/image-photo/${imageId}`, // 静态构造
  71. tags: [],
  72. width: dimensions.width,
  73. height: dimensions.height,
  74. });
  75. console.log(
  76. `✅ 成功 (静态构造): ${imageId} | URL: https://www.shutterstock.com/image-photo/${imageId}`,
  77. );
  78. // 注意:Shutterstock 无需请求,不需要 sleep
  79. continue;
  80. }
  81. if (source === "AdobeStock") {
  82. // AdobeStock 依然需要请求网络以获取标签
  83. const targetUrl = `https://stock.adobe.com/${imageId}`;
  84. let meta = { from: targetUrl, tags: [] };
  85. try {
  86. const response = await axios.get(targetUrl, {
  87. headers: {
  88. "User-Agent":
  89. "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  90. },
  91. timeout: 10000,
  92. });
  93. meta = await processAdobeStock(
  94. imageId,
  95. response.data,
  96. response.request.res.responseUrl || targetUrl,
  97. );
  98. console.log(`✅ 成功: ${imageId} | Tags: [${meta.tags.join(", ")}]`);
  99. } catch (parseErr) {
  100. console.warn(`⚠️ Adobe 解析失败,使用默认值: ${imageId} | ${parseErr.message}`);
  101. }
  102. results.push({
  103. id: imageId,
  104. source: source,
  105. filename: filename,
  106. localPath: filePath,
  107. from: meta.from,
  108. tags: meta.tags,
  109. width: dimensions.width,
  110. height: dimensions.height,
  111. });
  112. // Adobe 请求后执行休眠,防止请求过快
  113. await sleep(2000);
  114. }
  115. } catch (err) {
  116. console.error(`❌ 处理 ${filename} 失败: ${err.message}`);
  117. }
  118. }
  119. // 4. 输出 JSON
  120. await fs.writeJson(OUTPUT_FILE, results, { spaces: 2 });
  121. console.log(`\n🎉 全部完成!元数据已保存至 ${OUTPUT_FILE}`);
  122. } catch (err) {
  123. console.error("程序运行出错:", err);
  124. }
  125. }
  126. // 导出
  127. module.exports = { parseImages };
  128. // 脚本测试入口
  129. if (require.main === module) {
  130. // 获取命令行参数或使用默认路径
  131. const testDir =
  132. process.argv[2] || "/Users/guoziyun/content/picture-jigasw/shuttshock/test";
  133. parseImages(testDir);
  134. }