1
0
forked from Mike/lab
lab/max/max-scraper.py
2025-05-15 07:23:45 +00:00

208 lines
7.4 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import time
import random
import string
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
# Text file, relative to the working directory, that stores one
# already-saved movie link per line.
SEEN_LINKS_FILE = "seen_links.txt"

# Genre pages visited (in this order) by the genre mode.  The URLs are
# assembled from the slugs below.
_GENRE_SLUGS = ("action", "comedy", "drama", "horror", "sci-fi", "documentary")
GENRE_URLS = [f"https://play.max.com/genre/{slug}" for slug in _GENRE_SLUGS]
def setup_browser():
    """Build and return the Chrome WebDriver used by every scraping mode.

    The browser is started with three command-line switches (GPU disabled,
    a 1920x1080 window, and the ``AutomationControlled`` Blink feature
    disabled).  The driver executable path is whatever
    ``ChromeDriverManager().install()`` returns.

    Returns:
        The ``webdriver.Chrome`` instance.
    """
    chrome_flags = (
        "--disable-gpu",
        "--window-size=1920,1080",
        "--disable-blink-features=AutomationControlled",
    )
    browser_options = Options()
    for flag in chrome_flags:
        browser_options.add_argument(flag)

    driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_path), options=browser_options)
def scroll_to_bottom(driver, max_wait=60, pause=2.5):
    """Scroll the current page down until its height stops growing.

    After every scroll the function waits ``pause`` seconds and re-reads
    ``document.body.scrollHeight``.  It stops when the height is unchanged
    from the previous reading, or when more than ``max_wait`` seconds have
    elapsed since scrolling began.

    Args:
        driver: Object exposing ``execute_script(js)`` (a Selenium WebDriver
            in this file).
        max_wait: Time budget in seconds.  It is checked once per iteration,
            after the pause, so the total run time can exceed it by roughly
            one iteration.
        pause: Seconds to sleep after each scroll before measuring the page
            height again.

    Returns:
        None.  Progress is reported on stdout.
    """
    print("\U0001f4dc Scrolling to load all movies...")
    last_height = driver.execute_script("return document.body.scrollHeight")
    # monotonic() cannot jump when the system clock is adjusted, so the
    # timeout stays reliable.
    deadline = time.monotonic() + max_wait
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            print("\u2705 Reached bottom of page.")
            break
        last_height = new_height
        if time.monotonic() > deadline:
            print("\u23f1\ufe0f Scroll timed out.")
            break
def extract_movie_links(driver):
    """Collect the distinct movie URLs present on the current page.

    Every ``<a>`` element whose ``href`` attribute contains ``/movie/`` is
    located via XPath, and its ``href`` value is read once.

    Args:
        driver: Object exposing ``find_elements(by, value)`` (a Selenium
            WebDriver in this file).

    Returns:
        A list of unique href strings containing ``/movie/``, in the order
        the anchors were returned by the driver.
    """
    anchors = driver.find_elements(By.XPATH, '//a[contains(@href, "/movie/")]')
    hrefs = (anchor.get_attribute("href") for anchor in anchors)
    # get_attribute may yield None; skip those instead of raising TypeError
    # on the substring test.  dict.fromkeys de-duplicates while keeping a
    # stable first-seen order (a plain set would shuffle the output).
    return list(dict.fromkeys(href for href in hrefs if href and "/movie/" in href))
def generate_random_query():
    """Return a random search string of one or two lowercase ASCII letters.

    The length is drawn first (1 or 2), then that many letters are drawn
    with replacement from ``string.ascii_lowercase``.
    """
    length = random.choice([1, 2])
    picked = random.choices(string.ascii_lowercase, k=length)
    return "".join(picked)
def load_seen_links(path=None):
    """Read the set of links that have already been saved.

    Args:
        path: File to read, one link per line.  Defaults to
            ``SEEN_LINKS_FILE`` when omitted.

    Returns:
        A set of the non-blank lines with surrounding whitespace stripped.
        An empty set is returned when the file does not exist.
    """
    if path is None:
        path = SEEN_LINKS_FILE
    # Open directly and handle the "missing" case, rather than checking for
    # existence first: the file could disappear between the check and the
    # open.  The encoding is explicit so reads do not depend on the locale.
    try:
        with open(path, "r", encoding="utf-8") as f:
            stripped = (line.strip() for line in f)
            return {link for link in stripped if link}
    except FileNotFoundError:
        return set()
def save_seen_links(new_links, path=None):
    """Append links to the seen-links file, one per line.

    Args:
        new_links: Iterable of link strings to record.
        path: File to append to.  Defaults to ``SEEN_LINKS_FILE`` when
            omitted.  The file is created if it does not exist.
    """
    if path is None:
        path = SEEN_LINKS_FILE
    # Explicit encoding so the file content does not depend on the locale.
    with open(path, "a", encoding="utf-8") as f:
        f.writelines(link + "\n" for link in new_links)
def save_links(links, batch_id, prefix, output_dir="output_movies", chunk_size=10):
    """Write links to numbered text files, ``chunk_size`` links per file.

    Files are named ``{prefix}_{batch_id}_{n}.txt`` inside ``output_dir``
    (created if missing), with ``n`` counting up from 1.  A number whose
    file already exists is skipped, so output written earlier is never
    overwritten.  This matters because batch ids can repeat: the callers in
    this file restart them at 1 on every program run.

    Args:
        links: Sequence of link strings (must support ``len`` and slicing).
        batch_id: Value embedded in each file name.
        prefix: File name prefix, e.g. ``"movies"``.
        output_dir: Directory that receives the files.
        chunk_size: Maximum number of links per file.

    Returns:
        None.  One progress line per written file is printed to stdout.
    """
    os.makedirs(output_dir, exist_ok=True)
    part = 0
    for start in range(0, len(links), chunk_size):
        chunk = links[start:start + chunk_size]
        out = None
        while out is None:
            part += 1
            file_path = os.path.join(output_dir, f"{prefix}_{batch_id}_{part}.txt")
            try:
                # Mode "x" fails if the file exists instead of truncating it.
                out = open(file_path, "x", encoding="utf-8")
            except FileExistsError:
                # Name taken by earlier output; try the next part number.
                continue
        with out:
            out.writelines(link + "\n" for link in chunk)
        print(f"\U0001f4be Saved {len(chunk)} links to {file_path}")
def run_movies_mode(driver, seen_links, batch_counter):
    """Save a random sample of links from the ``/movies`` page.

    Loads the page, scrolls to the bottom, collects every movie link, asks
    the user how many to pick, and saves the picked links that are not
    already in ``seen_links``.

    Args:
        driver: Selenium WebDriver used to load and read the page.
        seen_links: Set of links saved previously.  Newly saved links are
            added to it, so later calls in the same session skip them.
        batch_counter: Batch id embedded in the output file names.

    Returns:
        None.
    """
    driver.get("https://play.max.com/movies")
    time.sleep(4)  # fixed pause after navigation, before scrolling starts
    scroll_to_bottom(driver)
    all_links = extract_movie_links(driver)
    print(f"\u2705 Loaded {len(all_links)} movie links.")

    while True:
        try:
            count = int(input("\U0001f522 How many random movies do you want to extract? "))
        except ValueError:
            print("\u274c Invalid number.")
            continue
        if count < 0:
            # random.sample raises ValueError for a negative sample size.
            print("\u274c Invalid number.")
            continue
        break

    sample = random.sample(all_links, min(count, len(all_links)))
    unique_links = [link for link in sample if link not in seen_links]
    duplicates = len(sample) - len(unique_links)
    if unique_links:
        save_links(unique_links, batch_counter, "movies")
        save_seen_links(unique_links)
        # Keep the in-memory set in step with the file just appended to.
        seen_links.update(unique_links)
        print(f"\u2705 Saved {len(unique_links)} new links. \u267b Skipped {duplicates} duplicates.")
    else:
        print("\u26a0\ufe0f All selected links were duplicates.")
def run_search_mode(driver, seen_links, search_counter):
    """Run ten searches with randomly chosen terms and save new movie links.

    For each attempt a term is drawn at random (repeats are possible), the
    search results page is loaded, and every movie link not yet in
    ``seen_links`` is saved.

    Args:
        driver: Selenium WebDriver used to load and read the pages.
        seen_links: Set of links saved previously.  Newly saved links are
            added to it, so a repeated query does not save them again.
        search_counter: Batch id for the first set of output files.  It is
            advanced locally after each query that yields new links; the
            caller's own variable is not affected.

    Returns:
        None.
    """
    search_terms = [
        "the", "man", "love", "dark", "moon", "fire", "red", "blue", "night",
        "girl", "life", "death", "dream", "war", "blood", "star", "light", "king", "queen",
    ]
    max_attempts = 10
    found_total = 0
    for attempt in range(max_attempts):
        query = random.choice(search_terms)
        url = f"https://play.max.com/search?q={query}"
        print(f"\n\U0001f50d Searching for query: '{query}'")
        driver.get(url)
        time.sleep(4)  # fixed pause after navigation, before reading links
        links = extract_movie_links(driver)
        unique_links = [link for link in links if link not in seen_links]
        duplicates = len(links) - len(unique_links)
        if unique_links:
            save_links(unique_links, search_counter, "search")
            save_seen_links(unique_links)
            # Keep the in-memory set in step with the file just appended to.
            seen_links.update(unique_links)
            print(f"\u2705 Found {len(unique_links)} new links. \u267b Skipped {duplicates} duplicates.")
            search_counter += 1
            found_total += len(unique_links)
        else:
            print(f"\u26a0\ufe0f No new links found for query '{query}'.")
        if attempt < max_attempts - 1:
            # No point pausing once the last search has finished.
            print("\u23f3 Waiting 10 seconds before next search...")
            time.sleep(10)
    # The emoji must be one \U escape: a "\ud83d\udcca" pair is two lone
    # surrogates in a Python str and cannot be encoded when printed.
    print(f"\n\U0001f4ca Search complete. Found total {found_total} new movie links in {max_attempts} searches.")
def run_genre_mode(driver, seen_links, genre_batch):
    """Visit every URL in ``GENRE_URLS`` and save the new movie links found.

    Each genre page is loaded and scrolled to the bottom before its links
    are collected.  Links already in ``seen_links`` are skipped.

    Args:
        driver: Selenium WebDriver used to load and read the pages.
        seen_links: Set of links saved previously.  Newly saved links are
            added to it, so a title that appears under several genres is
            saved only once.
        genre_batch: Batch id for the first set of output files.  It is
            advanced locally after each genre that yields new links; the
            caller's own variable is not affected.

    Returns:
        None.
    """
    found_total = 0
    for genre_url in GENRE_URLS:
        print(f"\n\U0001f3ac Loading genre: {genre_url}")
        driver.get(genre_url)
        time.sleep(4)  # fixed pause after navigation, before scrolling starts
        scroll_to_bottom(driver)
        links = extract_movie_links(driver)
        unique_links = [link for link in links if link not in seen_links]
        duplicates = len(links) - len(unique_links)
        if unique_links:
            save_links(unique_links, genre_batch, "genre")
            save_seen_links(unique_links)
            # Keep the in-memory set in step with the file just appended to.
            seen_links.update(unique_links)
            print(f"\u2705 Found {len(unique_links)} new links. \u267b Skipped {duplicates} duplicates.")
            genre_batch += 1
            found_total += len(unique_links)
        else:
            print("\u26a0\ufe0f No new unique links in this genre.")
        time.sleep(5)  # pause between genre pages
    print("📊 Genre scan complete. Found total", found_total, "new movie links.")
def main():
    """Interactive entry point: open the browser, wait for login, run the menu.

    One browser is started and reused for the whole session.  Choosing
    "Exit" asks whether to close the browser; any answer other than
    ``yes`` returns to the menu in the same browser.  The browser is
    always quit when this function ends, including when an exception
    propagates.
    """
    print("🛠️ Max.com Movie Scraper — Created by Mike | DRMLab.io Project")
    driver = setup_browser()
    try:
        print("\U0001f310 Opening Max.com homepage...")
        driver.get("https://play.max.com")
        time.sleep(3)
        print("\n\U0001f511 Please log in manually in the browser.")
        input("\u23f3 After you're logged in, press ENTER here to continue...")

        movie_batch = 1
        search_batch = 1
        genre_batch = 1
        while True:
            # Re-read the file on every pass so links saved by an earlier
            # menu choice are treated as seen.
            seen_links = load_seen_links()
            print("📋 === MAIN MENU ===")
            print("1⃣ Extract movies from /movies page")
            print("2⃣ Extract random movies via search")
            print("3⃣ Extract movies by genre")
            print("4⃣ Exit")
            choice = input("Enter your choice: ").strip()
            if choice == "1":
                run_movies_mode(driver, seen_links, movie_batch)
                movie_batch += 1
            elif choice == "2":
                run_search_mode(driver, seen_links, search_batch)
                search_batch += 1
            elif choice == "3":
                run_genre_mode(driver, seen_links, genre_batch)
                genre_batch += 1
            elif choice == "4":
                answer = input("⏹ Do you want to close the browser? (yes/no): ").strip().lower()
                if answer == "yes":
                    break
                # Otherwise keep the current browser and show the menu again.
                # Calling main() recursively here would start a second
                # browser while this one is still open.
            else:
                print("\u274c Invalid choice.")
        input("✅ Press ENTER to exit.")
    finally:
        # Cleanup only: no prompts here, so an error or Ctrl-C still
        # releases the browser.
        driver.quit()
# Start the interactive scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()