#!/usr/bin/env bash
set -euo pipefail

# Usage:
#   ./php_dedupe_audit.sh apps_activas.txt [OUTDIR]
#
# Input file format (apps_activas.txt):
#   /var/www/user2/app1
#   /var/www/user3/app2
#   ...

# Positional parameters: the directory list (mandatory) and the directory
# that receives every report (optional, defaults to php_dedupe_out).
LIST_FILE="${1:-}"
OUTDIR="${2:-php_dedupe_out}"

# Stop right away unless the first argument names an existing regular file.
[[ -n "${LIST_FILE}" && -f "${LIST_FILE}" ]] || {
  echo "ERROR: Provide a valid list file. Example: ./php_dedupe_audit.sh apps_activas.txt [OUTDIR]" >&2
  exit 1
}

mkdir -p "${OUTDIR}"

# Every report is written directly inside OUTDIR.
PHP_FILES="${OUTDIR}/php_files.txt"
HASHES_TSV="${OUTDIR}/php_hashes.tsv"
COUNTS_TSV="${OUTDIR}/hash_counts.tsv"
UNIQUE_FILES_TSV="${OUTDIR}/unique_files.tsv"
TOP_REUSE_TSV="${OUTDIR}/top_reused_hashes.tsv"

# Both of these files are filled by appending later on, so empty them first
# to keep rows from a previous run out of the results.
for scratch_file in "${PHP_FILES}" "${HASHES_TSV}"; do
  : > "${scratch_file}"
done

#######################################
# Collect every *.php file below the directories named in a list file.
# Arguments:
#   $1 - list file: one directory per line; surrounding whitespace is
#        trimmed, blank lines and lines starting with '#' are ignored
#   $2 - inventory file: found paths are appended, then the whole file is
#        sorted and de-duplicated
# Outputs:
#   Warnings on stderr for entries that are not directories and for
#   directories that find could not traverse completely.
#######################################
collect_php_files() {
  local list_file=$1 out_file=$2 app

  # The "|| [[ -n ... ]]" part keeps a final line without trailing newline.
  while IFS= read -r app || [[ -n "${app}" ]]; do
    app="${app#"${app%%[![:space:]]*}"}"  # ltrim
    app="${app%"${app##*[![:space:]]}"}"  # rtrim

    [[ -z "${app}" ]] && continue
    [[ "${app}" == \#* ]] && continue

    if [[ ! -d "${app}" ]]; then
      echo "WARN: Not a directory, skipping: ${app}" >&2
      continue
    fi

    # find exits non-zero when it cannot enter some subdirectory; it still
    # prints everything it could reach, so report the problem and carry on
    # instead of letting errexit abort the whole audit.
    find "${app}" -type f -name '*.php' -print >> "${out_file}" \
      || echo "WARN: find reported errors, results may be incomplete: ${app}" >&2
  done < "${list_file}"

  # A directory listed twice (or nested inside another listed directory)
  # yields the same path more than once, which would later be counted as a
  # duplicate of itself. sort reads all input before writing, so sorting in
  # place with -o is safe.
  LC_ALL=C sort -u -o "${out_file}" "${out_file}"
}

echo "==> Collecting PHP files from app directories in: ${LIST_FILE}"
collect_php_files "${LIST_FILE}" "${PHP_FILES}"

# Size of the inventory: one path per line, with any space padding in the
# wc output stripped.
TOTAL_FILES="$(wc -l < "${PHP_FILES}" | tr -d ' ')"
printf '%s\n' "==> Total PHP files found: ${TOTAL_FILES}"

# An empty inventory is not an error; finish successfully without reports.
if (( TOTAL_FILES == 0 )); then
  printf '%s\n' "Nothing to do (no PHP files found)."
  exit 0
fi

#######################################
# Print the dedupe hash of a single PHP file.
# The preferred hash is the sha256 of the source with comments removed and
# every whitespace run collapsed to one space (computed by php-cli through
# token_get_all). Note that the collapse is applied to the whole token
# stream, so whitespace inside string literals is collapsed as well.
# If php is unavailable, fails, or prints anything that is not a 64-digit
# hex string, the sha256 of the raw file bytes is used instead.
# Arguments:
#   $1 - path of the file to hash
# Outputs:
#   The hash followed by a newline on stdout.
# Returns:
#   0 on success, 1 if the file could not be read at all.
#######################################
php_content_hash() {
  local file=$1 digest=""

  if command -v php > /dev/null 2>&1; then
    # display_errors=0 keeps PHP diagnostics out of stdout, where they would
    # otherwise be captured as part of the hash. stdin is detached so php can
    # never consume the file list a calling loop is reading from.
    digest=$(php -d display_errors=0 -r '
      $path = $argv[1];
      $code = @file_get_contents($path);
      if ($code === false) { fwrite(STDERR, "READ_FAIL\t$path\n"); exit(2); }

      $tokens = token_get_all($code);
      $clean = "";

      foreach ($tokens as $t) {
        if (is_array($t)) {
          if ($t[0] === T_COMMENT || $t[0] === T_DOC_COMMENT) continue;
          $clean .= $t[1];
        } else {
          $clean .= $t;
        }
      }

      // Normalize whitespace (keep semantics stable enough for dedupe)
      $clean = preg_replace("/\s+/", " ", $clean);
      $clean = trim($clean);

      echo hash("sha256", $clean);
    ' "${file}" 2> /dev/null < /dev/null) || digest=""
  fi

  # Accept only a well-formed sha256; anything else triggers the fallback.
  if [[ ! "${digest}" =~ ^[0-9a-f]{64}$ ]]; then
    # Hash the raw bytes so the file is still recorded. Reading via stdin
    # keeps the file name out of the sha256sum output.
    digest=$(sha256sum < "${file}") || return 1
    digest=${digest%% *}
  fi

  printf '%s\n' "${digest}"
}

#######################################
# Hash every file named in an inventory and append the results as TSV.
# Arguments:
#   $1 - inventory file, one path per line
#   $2 - TSV to append to; each row is hash<TAB>filepath
# Outputs:
#   Warnings on stderr when php is missing or a file cannot be read.
#######################################
hash_php_files() {
  local list_file=$1 out_tsv=$2 file digest

  if ! command -v php > /dev/null 2>&1; then
    echo "WARN: php CLI not found; hashing raw file contents instead" >&2
  fi

  while IFS= read -r file || [[ -n "${file}" ]]; do
    # Guard in case file disappeared
    [[ -f "${file}" ]] || continue

    # An unreadable file is reported and skipped rather than aborting the run.
    if ! digest=$(php_content_hash "${file}"); then
      echo "WARN: Could not read, skipping: ${file}" >&2
      continue
    fi

    printf '%s\t%s\n' "${digest}" "${file}" >> "${out_tsv}"
  done < "${list_file}"
}

echo "==> Hashing (normalized content: remove comments, normalize whitespace) ..."
# Output: hash<TAB>filepath
hash_php_files "${PHP_FILES}" "${HASHES_TSV}"

#######################################
# Tally how often each hash occurs and derive the summary counters.
# Arguments:
#   $1 - hashes TSV to read (hash<TAB>filepath per line)
#   $2 - counts TSV to write (hash<TAB>count per line, sorted by hash)
# Globals written:
#   HASHED_FILES    - number of rows in the hashes TSV
#   UNIQUE_HASHES   - number of distinct hashes
#   DUPLICATE_FILES - HASHED_FILES minus UNIQUE_HASHES
#######################################
compute_hash_stats() {
  local hashes_tsv=$1 counts_tsv=$2

  # uniq -c prints "count hash"; awk flips that into hash<TAB>count.
  cut -f1 "${hashes_tsv}" \
    | LC_ALL=C sort \
    | LC_ALL=C uniq -c \
    | awk '{print $2"\t"$1}' \
    > "${counts_tsv}"

  HASHED_FILES=$(wc -l < "${hashes_tsv}" | tr -d ' ')
  UNIQUE_HASHES=$(wc -l < "${counts_tsv}" | tr -d ' ')
  # Based on the rows that were actually hashed: files skipped during
  # hashing must not be reported as duplicates.
  DUPLICATE_FILES=$(( HASHED_FILES - UNIQUE_HASHES ))
}

echo "==> Computing uniqueness and reuse stats ..."
# Count occurrences per hash
compute_hash_stats "${HASHES_TSV}" "${COUNTS_TSV}"

if (( HASHED_FILES != TOTAL_FILES )); then
  echo "WARN: Only ${HASHED_FILES} of ${TOTAL_FILES} listed files were hashed" >&2
fi

echo "==> Total PHP files:            ${TOTAL_FILES}"
echo "==> Unique PHP files (content): ${UNIQUE_HASHES}"
echo "==> Duplicate PHP files:        ${DUPLICATE_FILES}"

#######################################
# Write the most frequently reused hashes, highest count first.
# Arguments:
#   $1 - counts TSV to read (hash<TAB>count per line)
#   $2 - report to write (count<TAB>hash per line)
#   $3 - maximum number of rows to keep (optional, default 50)
#######################################
write_top_reused() {
  local counts_tsv=$1 out_tsv=$2 limit=${3:-50}

  # The row limit is applied by awk rather than head: awk reads its input to
  # the end, so sort is never killed by SIGPIPE. With head, a counts file
  # larger than the pipe buffer makes sort die, and pipefail + errexit then
  # abort the whole script. Ties are ordered by hash for stable output.
  LC_ALL=C sort -t $'\t' -k2,2nr -k1,1 "${counts_tsv}" \
    | awk -F'\t' -v limit="${limit}" 'NR <= limit { print $2 "\t" $1 }' \
    > "${out_tsv}"
}

# Top reused hashes
# Format: count<TAB>hash
write_top_reused "${COUNTS_TSV}" "${TOP_REUSE_TSV}"

echo "==> Top reused hashes written to: ${TOP_REUSE_TSV}"

#######################################
# List the files whose content hash occurs exactly once.
# Arguments:
#   $1 - counts TSV to read (hash<TAB>count per line)
#   $2 - hashes TSV to read (hash<TAB>filepath per line)
#   $3 - file to write: the hashes with count 1, one per line, sorted
#   $4 - file to write: the rows of $2 whose hash is listed in $3
#######################################
write_unique_files() {
  local counts_tsv=$1 hashes_tsv=$2 unique_hashes_out=$3 unique_files_out=$4

  # Build a set of hashes with count==1
  awk -F'\t' '$2 == 1 { print $1 }' "${counts_tsv}" \
    | LC_ALL=C sort -u \
    > "${unique_hashes_out}"

  # Exact join on the hash column. A fixed-string grep would also select
  # rows whose *path* happens to contain one of the hashes, and it needs
  # "|| true" for the no-match case, which hides real errors as well.
  # Comparing FILENAME with ARGV[1] (instead of NR==FNR) stays correct when
  # the list of unique hashes is empty.
  awk -F'\t' '
    FILENAME == ARGV[1] { once[$1] = 1; next }
    ($1 in once)
  ' "${unique_hashes_out}" "${hashes_tsv}" \
    > "${unique_files_out}"
}

# List unique (hash appears once): hash<TAB>file
write_unique_files "${COUNTS_TSV}" "${HASHES_TSV}" \
  "${OUTDIR}/unique_hashes.txt" "${UNIQUE_FILES_TSV}"

# Row count of the unique-files report, with any space padding in the wc
# output stripped.
UNIQUE_FILES_COUNT="$(wc -l < "${UNIQUE_FILES_TSV}" | tr -d ' ')"
printf '%s\n' "==> Unique-by-content files (count): ${UNIQUE_FILES_COUNT}"

# List the generated reports; printf reuses its format for each argument.
printf '%s\n' "==> Output files:"
printf '    - %s\n' \
  "${PHP_FILES}" \
  "${HASHES_TSV}" \
  "${COUNTS_TSV}" \
  "${TOP_REUSE_TSV}" \
  "${UNIQUE_FILES_TSV}"

printf '%s\n' "==> Done."

